In [67]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline #data flowing pannel
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score
from sklearn.preprocessing import LabelBinarizer

In [95]:
pipeline=Pipeline([
    ('vect',TfidfVectorizer(stop_words='english')),
    ('clf',LogisticRegression())
])

In [112]:
parameters={
    'vect__max_df':(0.25,0.5,0.75),
    'vect__stop_words':('english',None),
    'vect__ngram_range':((1,1),(1,2)),
    'vect__use_idf':(True,False),
    'vect__max_features':(2500,5000,10000),
    'vect__norm':('l1','l2'),
    'clf__penalty':('l1','l2'),
    'clf__C':(0.01,0.1,1,10) # regularization parameter:strength of penalty
}

In [113]:
grid_search=GridSearchCV(estimator=pipeline,param_grid=parameters,n_jobs=-1,verbose=1,cv=3,scoring='accuracy')

In [114]:
df=pd.read_csv(r"C:\Users\archa\OneDrive\Desktop\Downloads\SMSSpamCollection",sep='\t',header=None)
df

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [115]:
x,y=df.iloc[:,1],df.iloc[:,0]
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: 0, Length: 5572, dtype: object

In [116]:
lb=LabelBinarizer()
y=lb.fit_transform(y)

In [117]:
y=np.ravel(y) #to convert multidimensional array to 1D
y

array([0, 0, 1, ..., 0, 0, 0])

In [118]:
xtrain,xtest,ytrain,ytest=train_test_split(x,y)

In [119]:
grid_search.fit(xtrain,ytrain)

Fitting 3 folds for each of 1152 candidates, totalling 3456 fits


1728 fits failed out of a total of 3456.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1728 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\archa\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\archa\anaconda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\archa\anaconda3\Lib\site-packages\sklearn\pipeline.py", line 420, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "C:\Users\archa\anaconda3\Lib\site-packages\sklearn\base.py",

In [120]:
print('best score',grid_search.best_score_)

best score 0.9839674563292654


In [121]:
best_parameter=grid_search.best_estimator_.get_params()
best_parameter

{'memory': None,
 'steps': [('vect',
   TfidfVectorizer(max_df=0.25, max_features=5000, ngram_range=(1, 2))),
  ('clf', LogisticRegression(C=10))],
 'verbose': False,
 'vect': TfidfVectorizer(max_df=0.25, max_features=5000, ngram_range=(1, 2)),
 'clf': LogisticRegression(C=10),
 'vect__analyzer': 'word',
 'vect__binary': False,
 'vect__decode_error': 'strict',
 'vect__dtype': numpy.float64,
 'vect__encoding': 'utf-8',
 'vect__input': 'content',
 'vect__lowercase': True,
 'vect__max_df': 0.25,
 'vect__max_features': 5000,
 'vect__min_df': 1,
 'vect__ngram_range': (1, 2),
 'vect__norm': 'l2',
 'vect__preprocessor': None,
 'vect__smooth_idf': True,
 'vect__stop_words': None,
 'vect__strip_accents': None,
 'vect__sublinear_tf': False,
 'vect__token_pattern': '(?u)\\b\\w\\w+\\b',
 'vect__tokenizer': None,
 'vect__use_idf': True,
 'vect__vocabulary': None,
 'clf__C': 10,
 'clf__class_weight': None,
 'clf__dual': False,
 'clf__fit_intercept': True,
 'clf__intercept_scaling': 1,
 'clf__l1_rati

In [123]:
for param_name in parameters.keys():
    print(param_name,":",best_parameter[param_name])

vect__max_df : 0.25
vect__stop_words : None
vect__ngram_range : (1, 2)
vect__use_idf : True
vect__max_features : 5000
vect__norm : l2
clf__penalty : l2
clf__C : 10


In [125]:
vectoriser=TfidfVectorizer(max_df=0.25,stop_words='english',max_features=5000,ngram_range=(1,2),use_idf=True,norm='l2')
xtrain=vectoriser.fit_transform(xtrain)
xtest=vectoriser.transform(xtest)

In [126]:
ytrain

array([0, 0, 1, ..., 0, 0, 0])

In [127]:
clf=LogisticRegression(C=10)
clf.fit(xtrain,ytrain)

In [128]:
pred=clf.predict(xtest)

In [129]:
print('accuracy score',accuracy_score(ytest,pred))
print('recall',recall_score(ytest,pred))
print('precision',precision_score(ytest,pred))
print('f1_score',f1_score(ytest,pred))

accuracy score 0.9798994974874372
recall 0.885
precision 0.9725274725274725
f1_score 0.9267015706806283
