In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the tab-separated SMS dataset. A forward slash is used in the path:
# the previous 'data\spam.tsv' relied on the invalid escape sequence '\s'
# and only resolved on Windows; 'data/spam.tsv' works on every platform.
df = pd.read_csv('data/spam.tsv', sep='\t')

In [3]:
# Peek at the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
# Partition the rows into the two classes using the 'label' column.
ham = df.loc[df['label'].eq('ham')]
spam = df.loc[df['label'].eq('spam')]

In [5]:
# (rows, columns) of each class subset, ham first.
tuple(subset.shape for subset in (ham, spam))

((4825, 4), (747, 4))

In [6]:
# Downsample ham to the spam row count so both classes end up the same size.
# The fixed random_state makes the drawn subset (and every result derived
# from it) identical on each Restart & Run All; without it the sample changed
# on every execution.
ham = ham.sample(n=spam.shape[0], random_state=0)

In [7]:
# Stack the ham subset on top of the spam rows and renumber the index from 0.
data = pd.concat((ham, spam), ignore_index=True)

In [8]:
# Peek at the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,label,message,length,punct
0,ham,Just sing HU. I think its also important to fi...,251,4
1,ham,Yetunde i'm in class can you not run water on ...,72,3
2,ham,Hello darlin ive finished college now so txt m...,84,0
3,ham,R we still meeting 4 dinner tonight?,36,1
4,ham,Your board is working fine. The issue of overh...,135,5


In [9]:
# Row count per class label in the combined frame.
data.loc[:, 'label'].value_counts()

ham     747
spam    747
Name: label, dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
 # Rename the label/message columns to Target/Text. The renamed frame is
 # rebound to `data` instead of mutating in place, so the frame object shown
 # by earlier cells is never modified behind the reader's back.
 data = data.rename(columns={'label': 'Target', 'message': 'Text'})

In [12]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['Text'],
    data['Target'],
    test_size=0.3,
    random_state=0,
)

In [13]:
# Re-join each text series with its label series side by side.
train_set = pd.concat((X_train, y_train), axis='columns')
test_set = pd.concat((X_test, y_test), axis='columns')

In [14]:
# Persist both splits as CSV with a header row and without the index column.
for frame, path in ((train_set, 'train.csv'), (test_set, 'test.csv')):
    frame.to_csv(path, index=False, header=True)

In [15]:
# Read the two split files back in, train first.
train_df, test_df = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# TF-IDF vectorizer constructed with no arguments, i.e. the library defaults.
tfidf = TfidfVectorizer()

In [18]:
# Name of the label column looked up in the train/test frames.
target_column_name = "Target"

In [19]:
# Labels come from the target column; the features are the first column that
# remains once the target column is dropped.
y_train = train_df[target_column_name]
X_train = train_df.drop(columns=target_column_name).iloc[:, 0]

y_test = test_df[target_column_name]
X_test = test_df.drop(columns=target_column_name).iloc[:, 0]

In [20]:
# Fit the vectorizer on the training text only, then reuse the fitted
# vectorizer to transform the test text.
input_feature_train_arr=tfidf.fit_transform(X_train)
input_feature_test_arr=tfidf.transform(X_test)

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [23]:
# Candidate classifiers keyed by display name; the same keys are used to look
# up each model's grid in `params`.
models = {
    # Seeded so the fitted forest (and the scores computed from it) are
    # repeatable between runs; it was previously left unseeded.
    "Random Forest": RandomForestClassifier(random_state=0),
    "SVM": SVC(),
}

In [24]:
# Hyper-parameter grids, one per entry in `models` (looked up by the same key).
# Every list holds a single candidate value.
params = {
    "Random Forest": {
        # Only the forest size is searched; a wider grid over
        # 'criterion' / 'max_features' was left disabled.
        "n_estimators": [10],
    },
    "SVM": {
        "C": [100],
        "gamma": ["auto"],
    },
}

In [26]:
# Pick the first entry of `models` and the grid stored under the same key.
model = next(iter(models.values()))
para = params[next(iter(models))]

In [27]:
# Show the selected estimator together with its grid.
(model, para)

(RandomForestClassifier(), {'n_estimators': [10]})

In [28]:
from sklearn.model_selection import GridSearchCV

In [29]:
# Grid search over `para` for the selected estimator, using 3 CV folds.
gs = GridSearchCV(estimator=model, param_grid=para, cv=3)

In [30]:
# Display the configured (not yet fitted) search object.
gs

In [32]:
# Run the grid search on the vectorized training text and its labels.
gs.fit(input_feature_train_arr, y_train)

In [33]:
# Copy the best parameter combination found by the search onto `model`.
# NOTE(review): GridSearchCV refits the best estimator by default (exposed as
# gs.best_estimator_), so the manual refit that follows may be redundant — confirm.
model.set_params(**gs.best_params_)

In [34]:
# Fit the selected estimator on the vectorized training text and its labels.
model.fit(input_feature_train_arr, y_train)

In [35]:
# Predictions on the same training matrix the model was fitted on.
y_train_predict = model.predict(input_feature_train_arr)

In [36]:
# Predictions on the held-out test matrix.
y_test_predict = model.predict(input_feature_test_arr)

In [37]:
# Accuracy on the training data (labels vs. the model's predictions for them).
accuracy_score(y_true=y_train, y_pred=y_train_predict)

0.9971291866028709

In [38]:
# Accuracy on the held-out test data.
accuracy_score(y_true=y_test, y_pred=y_test_predict)

0.9398663697104677