In [1]:
import sklearn
import pandas as pd
from sklearn.preprocessing import StandardScaler

#620150009
#Daunte Robertson

df = pd.read_csv('train.csv')

In [2]:
df.head()

Unnamed: 0,label,text,EmailID
0,Ham,eric called me last night and i am happy to co...,0
1,Spam,tr what is funnygadget com funnygadget com ca...,1
2,Ham,"ricky sent me the nom early - 87 , 000 for the...",2
3,Ham,these are the numbers that are being exported ...,3
4,Spam,authentic replica rolex and other watches for ...,4


In [3]:
df.isnull().values.any()

False

In [4]:
len(df)

46524

In [None]:
df.loc[df['label'] == 'Spam', 'label'] = 0
df.loc[df['label'] == 'Ham', 'label'] = 1

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer



# Preprocess text data
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def preprocess_text(text):
    # Tokenize text
    words = word_tokenize(text)
    # Remove stop words and apply stemming
    words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the 'Text' column
df['ProcessedText'] = df['text'].apply(preprocess_text)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X = tfidf_vectorizer.fit_transform(df['ProcessedText'])

In [7]:
from sklearn.model_selection import train_test_split

y = df['label']  # Target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
from sklearn.naive_bayes import MultinomialNB

# Create and train the model
model0 = MultinomialNB()
model0.fit(X_train, y_train)

In [9]:
from sklearn.metrics import accuracy_score, classification_report

# Make predictions
y_pred = model0.predict(X_test)

# Evaluate performance
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Generate classification report
print(classification_report(y_test, y_pred))


Accuracy: 0.9045674368619022
              precision    recall  f1-score   support

         Ham       0.90      0.93      0.91      4930
        Spam       0.92      0.88      0.90      4375

    accuracy                           0.90      9305
   macro avg       0.91      0.90      0.90      9305
weighted avg       0.90      0.90      0.90      9305



In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

model1 = LogisticRegression()
model1.fit(X_train, y_train)

In [11]:
y_pred = model1.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

In [12]:
accuracy

0.9455131649650725

In [13]:
precision

0.9457132817287724

In [14]:
from sklearn.tree import DecisionTreeClassifier

model2 = DecisionTreeClassifier()
model2.fit(X_train, y_train)

In [15]:
y_pred = model2.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

In [16]:
accuracy

0.9265986029016657

In [17]:
precision

0.9266384930981135

In [18]:
from sklearn.ensemble import RandomForestClassifier

model3 = RandomForestClassifier(n_estimators=100, random_state=42)
model3.fit(X_train, y_train)

In [19]:
y_pred = model3.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
conf_matrix = confusion_matrix(y_test, y_pred)

In [20]:
accuracy

0.9676518001074691

In [21]:
precision

0.967661686447862

In [22]:
test_df = pd.read_csv('test.csv')
test_df.isna().sum()

text       0
EmailID    0
dtype: int64

In [23]:
IDs = test_df['EmailID']
test_df

Unnamed: 0,text,EmailID
0,monika try calling ron heller escapenumber esc...,46524
1,"dear all ,\nattached is a paper describing the...",46525
2,arm inc e yo xual des spe ume reas ur se ire r...,46526
3,you are the man http ourmix hk,46527
4,kay mann enron com bmm b b b b b b b escapenum...,46528
...,...,...
11626,start date : 1 / 10 / 02 ; hourahead hour : 10...,58150
11627,customer service rep please contact customer p...,58151
11628,legal operating systems for a quarter of the p...,58152
11629,plan conference call escapenumberpm sunday dis...,58153


In [24]:
test_df['ProcessedText'] = test_df['text'].apply(preprocess_text)
X_test = tfidf_vectorizer.fit_transform(test_df['ProcessedText'])

In [30]:
y_pred_test0 = model0.predict(X_test)
y_pred_test1 = model1.predict(X_test)
y_pred_test2 = model2.predict(X_test)
y_pred_test3 = model3.predict(X_test)

In [31]:
y_pred_test0

array(['Spam', 'Ham', 'Ham', ..., 'Ham', 'Ham', 'Ham'], dtype='<U4')

In [35]:
y_pred_test1

array(['Spam', 'Ham', 'Ham', ..., 'Ham', 'Ham', 'Ham'], dtype=object)

In [36]:
y_pred_test2

array(['Ham', 'Spam', 'Ham', ..., 'Ham', 'Ham', 'Spam'], dtype=object)

In [37]:
y_pred_test3

array(['Ham', 'Ham', 'Ham', ..., 'Ham', 'Ham', 'Ham'], dtype=object)

In [38]:
df_submission0 = pd.DataFrame({'EmailID': IDs, 'label': y_pred_test0})
df_submission1 = pd.DataFrame({'EmailID': IDs, 'label': y_pred_test1})
df_submission2 = pd.DataFrame({'EmailID': IDs, 'label': y_pred_test2})
df_submission3 = pd.DataFrame({'EmailID': IDs, 'label': y_pred_test3})

In [39]:
df_submission0

Unnamed: 0,EmailID,label
0,46524,Spam
1,46525,Ham
2,46526,Ham
3,46527,Ham
4,46528,Spam
...,...,...
11626,58150,Ham
11627,58151,Ham
11628,58152,Ham
11629,58153,Ham


In [40]:
df_submission0.to_csv('submission0.csv', index=False)
df_submission1.to_csv('submission1.csv', index=False)
df_submission2.to_csv('submission2.csv', index=False)
df_submission3.to_csv('submission3.csv', index=False)