# Implementing Pipeline on Larger Data

In [7]:
# Connecting to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Loading the data
file_path = "/content/drive/MyDrive/Datasets/spam.csv"

import pandas as pd

# Working around the encoding error while loading the dataset
encodings = ["utf-8", "latin1", "iso-8859-1", "utf-16"]

for encoding in encodings:
    try:
        data = pd.read_csv(file_path, encoding=encoding)
        print("File read successfully using encoding:", encoding)
        break  # Exit the loop if the file is read successfully
    except UnicodeDecodeError:
        print("Failed to read with encoding:", encoding)

Failed to read with encoding: utf-8
File read successfully using encoding: latin1


In [9]:
# Renaming the columns
data = data.rename(columns={'v1': 'label', 'v2': 'message'})

# Replace with binary
data['label'] = data['label'].replace({'ham': 1, 'spam': 0})

# Dropping unnecessary columns
data.drop(columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], inplace = True)

# Viewing the data
data.head

<bound method NDFrame.head of       label                                            message
0         1  Go until jurong point, crazy.. Available only ...
1         1                      Ok lar... Joking wif u oni...
2         0  Free entry in 2 a wkly comp to win FA Cup fina...
3         1  U dun say so early hor... U c already then say...
4         1  Nah I don't think he goes to usf, he lives aro...
...     ...                                                ...
5567      0  This is the 2nd time we have tried 2 contact u...
5568      1              Will Ì_ b going to esplanade fr home?
5569      1  Pity, * was in mood for that. So...any other s...
5570      1  The guy did some bitching but I acted like i'd...
5571      1                         Rofl. Its true to its name

[5572 rows x 2 columns]>

In [13]:
# Importing libraries
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
import xgboost as xgb

In [21]:
# Split data into train and test sets
train_data, test_data = train_test_split(data, test_size=0.6, random_state=42, stratify=data['label'])

In [22]:
# Using the best performing pipeline
pipeline_1 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', xgb.XGBClassifier())
])

# Define the parameter grid
param_grid = {
    'tfidf__max_features': [1000, 2000, 3000],
    'clf__learning_rate': [0.01, 0.1, 0.2],
    'clf__max_depth': [3, 5, 7],
    'clf__n_estimators': [100, 200, 300]
}

# Create randomized search
random_search_1 = RandomizedSearchCV(estimator=pipeline_1, param_distributions=param_grid, n_iter=10, cv=5, random_state=42)

In [23]:
# Fit the randomized search to find the best model
random_search_1.fit(train_data['message'], train_data['label'])

# Get the best model from the random search
best_model_1 = random_search_1.best_estimator_

# Make predictions using the best model
predictions_1 = best_model_1.predict(test_data['message'])

# Now you can evaluate the predictions using the appropriate evaluation metrics
precision_1 = precision_score(test_data['label'], predictions_1, pos_label=0)
recall_1 = recall_score(test_data['label'], predictions_1, pos_label=0)
f1_1 = f1_score(test_data['label'], predictions_1, pos_label=0)

# Create a classification report
classification_report_1 = classification_report(test_data['label'], predictions_1)

In [24]:
# Print evaluation metrics
print("Pipeline 1:")
print(f"Precision: {precision_1}")
print(f"Recall: {recall_1}")
print(f"F1 Score: {f1_1}")
print(classification_report_1)

Pipeline 1:
Precision: 0.9413333333333334
Recall: 0.7879464285714286
F1 Score: 0.8578371810449574
              precision    recall  f1-score   support

           0       0.94      0.79      0.86       448
           1       0.97      0.99      0.98      2896

    accuracy                           0.97      3344
   macro avg       0.95      0.89      0.92      3344
weighted avg       0.96      0.97      0.96      3344

