In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import spacy

In [3]:
nlp = spacy.load('en_core_web_lg')

In [4]:
##loading the data
df_train =pd.read_csv('Train.csv')
df_test = pd.read_csv('Test.csv')

In [5]:
##displaying the data
df_train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [6]:
##checking the distribution of the agreement
df_train['agreement'].value_counts()

1.000000    5866
0.666667    3894
0.333333     239
Name: agreement, dtype: int64

In [7]:
##checking the distribution of the label
df_train['label'].value_counts()

 0.000000    4908
 1.000000    4053
-1.000000    1038
 0.666667       1
Name: label, dtype: int64

In [8]:
df_train_modified=df_train[df_train['label'].values==0.666667].copy()

In [9]:
len(df_train_modified)

0

In [10]:
df_train_modified['label'].value_counts()

Series([], Name: label, dtype: int64)

In [11]:
##checking the length
len(df_train)

10001

In [12]:
df_test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [13]:
list(df_train.itertuples())

[Pandas(Index=0, tweet_id='CL1KWCMY', safe_text='Me &amp; The Big Homie meanboy3000 #MEANBOY #MB #MBS #MMR #STEGMANLIFE @ Stegman St. <url>', label=0.0, agreement=1.0),
 Pandas(Index=1, tweet_id='E3303EME', safe_text="I'm 100% thinking of devoting my career to proving autism isn't caused by vaccines due to the IDIOTIC posts I've seen about World Autism Day", label=1.0, agreement=1.0),
 Pandas(Index=2, tweet_id='M4IVFSMS', safe_text='#whatcausesautism VACCINES, DO NOT VACCINATE YOUR CHILD', label=-1.0, agreement=1.0),
 Pandas(Index=3, tweet_id='1DR6ROZ4', safe_text="I mean if they immunize my kid with something that won't secretly kill him years down the line then I'm all for it, but I don't trust that", label=-1.0, agreement=1.0),
 Pandas(Index=4, tweet_id='J77ENIIE', safe_text='Thanks to <user> Catch me performing at La Nuit NYC 1134 1st ave. Show starts at 6! #jennifair #mmr… <url>', label=0.0, agreement=1.0),
 Pandas(Index=5, tweet_id='OVNPOAUX', safe_text='<user> a nearly 67 year o

In [14]:
##checking blank messages
blank_index = [i[0] for i in df_train.itertuples() if i[2].isspace()]
blank_index

[]

In [15]:
##checking null values on the train
df_train.isnull().sum()

tweet_id     0
safe_text    0
label        1
agreement    2
dtype: int64

In [16]:
##deleting null values
df_train.dropna(inplace=True)

In [17]:
df_train.isnull().sum()

tweet_id     0
safe_text    0
label        0
agreement    0
dtype: int64

In [18]:
##spitting the data into traing and validation set
X = df_train['safe_text']
y = df_train['label']

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC,SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
lsv = Pipeline([('vec',TfidfVectorizer()),('clf',LinearSVC())])
sv = Pipeline([('vec',TfidfVectorizer()),('clf',SVC())])
lg = Pipeline([('vec',TfidfVectorizer()),('clf',LogisticRegression(max_iter=10000,class_weight='balanced'))])
mnb = Pipeline([('vec',TfidfVectorizer()),('clf',MultinomialNB())])
xgb = Pipeline([('vec',TfidfVectorizer()),('clf',XGBClassifier())])
rf = Pipeline([('vec',TfidfVectorizer()),('clf',RandomForestClassifier())])

In [22]:
##training all the models
def fit_all(lst,X,y):
    lis = []
    for pip in lst:
        pip.fit(X,y)
        lis.append(pip)
    return lis

In [23]:
trained = fit_all([lsv,sv,lg,mnb,xgb,rf],X_train,y_train)

In [24]:
trained

[Pipeline(steps=[('vec', TfidfVectorizer()), ('clf', LinearSVC())]),
 Pipeline(steps=[('vec', TfidfVectorizer()), ('clf', SVC())]),
 Pipeline(steps=[('vec', TfidfVectorizer()),
                 ('clf',
                  LogisticRegression(class_weight='balanced', max_iter=10000))]),
 Pipeline(steps=[('vec', TfidfVectorizer()), ('clf', MultinomialNB())]),
 Pipeline(steps=[('vec', TfidfVectorizer()),
                 ('clf',
                  XGBClassifier(base_score=0.5, booster='gbtree',
                                colsample_bylevel=1, colsample_bynode=1,
                                colsample_bytree=1, gamma=0, gpu_id=-1,
                                importance_type='gain',
                                interaction_constraints='',
                                learning_rate=0.300000012, max_delta_step=0,
                                max_depth=6, min_child_weight=1, missing=nan,
                                monotone_constraints='()', n_estimators=100,
              

In [25]:
##making predictions for each trained model

##LinearSV
lsvc_pred= trained[0].predict(X_test)
##SVC
svc_pred= trained[1].predict(X_test)
##Logistic
lg_pred= trained[2].predict(X_test)
##MultinomialNB
mnb_pred= trained[3].predict(X_test)
##Xgboost
xgb_pred = trained[4].predict(X_test)
##Random forest
rf_pred = trained[5].predict(X_test)

In [26]:
from sklearn.metrics import classification_report,mean_squared_error

In [27]:
##evaluation of the models

print('LinearSV')
print(classification_report(y_test,lsvc_pred))
print('\n\n')
print('SVC')
print(classification_report(y_test,svc_pred))
print('\n\n')
print('LOgistc')
print(classification_report(y_test,lg_pred))
print('\n\n')
print('Multinomial')
print(classification_report(y_test,mnb_pred))
print('\n\n')
print('Xgboot')
print(classification_report(y_test,xgb_pred))
print('\n\n')
print('Random forest')
print(classification_report(y_test,rf_pred))

LinearSV
              precision    recall  f1-score   support

        -1.0       0.58      0.30      0.39       240
         0.0       0.77      0.80      0.79       943
         1.0       0.69      0.76      0.72       817

    accuracy                           0.72      2000
   macro avg       0.68      0.62      0.63      2000
weighted avg       0.72      0.72      0.71      2000




SVC
              precision    recall  f1-score   support

        -1.0       0.85      0.14      0.24       240
         0.0       0.78      0.85      0.81       943
         1.0       0.69      0.78      0.73       817

    accuracy                           0.73      2000
   macro avg       0.77      0.59      0.59      2000
weighted avg       0.75      0.73      0.71      2000




LOgistc
              precision    recall  f1-score   support

        -1.0       0.42      0.58      0.49       240
         0.0       0.83      0.79      0.81       943
         1.0       0.73      0.69      0.71     

In [28]:
import numpy as np

print('LinearSV')
print(np.sqrt(mean_squared_error(y_test,lsvc_pred)))
print('\n\n')
print('SVC')
print(np.sqrt(mean_squared_error(y_test,svc_pred)))
print('\n\n')
print('LOgistc')
print(np.sqrt(mean_squared_error(y_test,lg_pred)))
print('\n\n')
print('Multinomial')
print(np.sqrt(mean_squared_error(y_test,mnb_pred)))
print('\n\n')
print('Xgboot')
print(np.sqrt(mean_squared_error(y_test,xgb_pred)))
print('\n\n')
print('Random forest')
print(np.sqrt(mean_squared_error(y_test,rf_pred)))

LinearSV
0.700713921654194



SVC
0.7042726744663603



LOgistc
0.7592759709091287



Multinomial
0.7443117626371358



Xgboot
0.7099295739719539



Random forest
0.7276675064890558


In [29]:
from sklearn.model_selection import GridSearchCV

##hyperparameter tunning

In [30]:
param_grid = {
    'clf__C': np.logspace(0, 2)
}
search = GridSearchCV(lg, param_grid, n_jobs=-1)
search.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('clf',
                                        LogisticRegression(class_weight='balanced',
                                                           max_iter=10000))]),
             n_jobs=-1,
             param_grid={'clf__C': array([  1.        ,   1.09854114,   1.20679264,   1.32571137,
         1.45634848,   1.59985872,   1.75751062,   1.93069773,
         2.12095089,   2.32995181,   2.55954792,   2.8117687 ,
         3.0888436 ,   3.39322177,   3.72759372,   4.09491506,
         4.49843267,   4...2,
         6.55128557,   7.19685673,   7.90604321,   8.68511374,
         9.54095476,  10.48113134,  11.51395399,  12.64855217,
        13.89495494,  15.26417967,  16.76832937,  18.42069969,
        20.23589648,  22.22996483,  24.42053095,  26.82695795,
        29.47051703,  32.37457543,  35.56480306,  39.06939937,
        42.9193426 ,  47.14866363,  51.79474679,  56.89866029,
    

In [31]:
search.best_params_

{'clf__C': 4.094915062380425}

In [32]:
predic = search.best_estimator_.predict(X_test)

In [33]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test,predic))}")

The root mean square error is 0.7483314773547883


In [36]:
df_test.dropna(inplace=True)

In [37]:
X_val = df_test['safe_text']

In [38]:
df_test['label']=search.best_estimator_.predict(X_val)

In [39]:
submision5 = df_test[['tweet_id','label']]

In [40]:
submision5.to_csv('submision5.csv',index=False)

In [41]:
##making prediction on the test set

In [42]:
df_test.dropna(inplace=True)

In [43]:
X_val = df_test['safe_text']


In [44]:
df_test['label']=lg.predict(X_val)

In [45]:
df_test.head()

Unnamed: 0,tweet_id,safe_text,label
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,-1.0
1,00UNMD0E,Students starting school without whooping coug...,1.0
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...",0.0
3,01HOEQJW,How many innocent children die for lack of vac...,1.0
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...",0.0


In [47]:
submision = df_test[['tweet_id','label']]

In [48]:
submision.to_csv('submision1.csv',index=False)

In [49]:
################################################### working on submission2 ###############################
lg2 = Pipeline([('vec',TfidfVectorizer()),('clf',LogisticRegression(multi_class="multinomial"))])

In [50]:
lg2.fit(X_train,y_train)

Pipeline(steps=[('vec', TfidfVectorizer()),
                ('clf', LogisticRegression(multi_class='multinomial'))])

In [51]:
lg2_pred=lg2.predict(X_test)

In [52]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test,lg2_pred))}")

The root mean square error is 0.6902897942168926


In [53]:
df1 = df_train.copy()

In [54]:
import string
from nltk.corpus import stopwords

In [55]:
from nltk.tokenize import RegexpTokenizer

In [56]:
##removing puntuations
def remove_punc(string1):
    tokenizer = RegexpTokenizer(r'\w+')
    return ' '.join(tokenizer.tokenize(string1))
     
    

In [57]:
df1["no_puct"]=df1['safe_text'].apply(remove_punc)

In [58]:
df1.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,no_puct
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0,Me amp The Big Homie meanboy3000 MEANBOY MB MB...
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0,I m 100 thinking of devoting my career to prov...
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0,whatcausesautism VACCINES DO NOT VACCINATE YOU...
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0,I mean if they immunize my kid with something ...
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0,Thanks to user Catch me performing at La Nuit ...


In [59]:
##converting to lowercase
def lower_case(string1):
    return string1.lower()

In [60]:
lower_case("MARUMO ABEL")

'marumo abel'

In [61]:
df1["lower_case"]=df1['no_puct'].apply(lower_case)

In [62]:
df1.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,no_puct,lower_case
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0,Me amp The Big Homie meanboy3000 MEANBOY MB MB...,me amp the big homie meanboy3000 meanboy mb mb...
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0,I m 100 thinking of devoting my career to prov...,i m 100 thinking of devoting my career to prov...
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0,whatcausesautism VACCINES DO NOT VACCINATE YOU...,whatcausesautism vaccines do not vaccinate you...
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0,I mean if they immunize my kid with something ...,i mean if they immunize my kid with something ...
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0,Thanks to user Catch me performing at La Nuit ...,thanks to user catch me performing at la nuit ...


In [63]:
###removing numbers
def remove_numbers(string1):
    return ''.join(i for i in string1 if i not in '0123456789')

In [64]:
remove_numbers('marumo1 abel2')

'marumo abel'

In [65]:
df1['no_numbers']=df1['lower_case'].apply(remove_numbers)

In [66]:
df1.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,no_puct,lower_case,no_numbers
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0,Me amp The Big Homie meanboy3000 MEANBOY MB MB...,me amp the big homie meanboy3000 meanboy mb mb...,me amp the big homie meanboy meanboy mb mbs mm...
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0,I m 100 thinking of devoting my career to prov...,i m 100 thinking of devoting my career to prov...,i m thinking of devoting my career to proving...
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0,whatcausesautism VACCINES DO NOT VACCINATE YOU...,whatcausesautism vaccines do not vaccinate you...,whatcausesautism vaccines do not vaccinate you...
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0,I mean if they immunize my kid with something ...,i mean if they immunize my kid with something ...,i mean if they immunize my kid with something ...
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0,Thanks to user Catch me performing at La Nuit ...,thanks to user catch me performing at la nuit ...,thanks to user catch me performing at la nuit ...


In [67]:
##spiling the data
X1 = df1['no_numbers']
y1 = df1['label']

In [68]:
 X_train1, X_test1, y_train1, y_test1= train_test_split(X1, y1, test_size=0.2, random_state=42)

In [69]:
lsv1 = Pipeline([('vec',TfidfVectorizer()),('clf',LinearSVC())])
sv1 = Pipeline([('vec',TfidfVectorizer()),('clf',SVC())])
lg1 = Pipeline([('vec',TfidfVectorizer()),('clf',LogisticRegression(multi_class='multinomial'))])
mnb1 = Pipeline([('vec',TfidfVectorizer()),('clf',MultinomialNB())])
xgb1 = Pipeline([('vec',TfidfVectorizer()),('clf',XGBClassifier())])
rf1 = Pipeline([('vec',TfidfVectorizer()),('clf',RandomForestClassifier())])

In [70]:
param_grid = {
    'clf__C': np.logspace(0, 2)
}
search1 = GridSearchCV(lg, param_grid, n_jobs=-1)
search1.fit(X_train1, y_train1)

GridSearchCV(estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('clf',
                                        LogisticRegression(class_weight='balanced',
                                                           max_iter=10000))]),
             n_jobs=-1,
             param_grid={'clf__C': array([  1.        ,   1.09854114,   1.20679264,   1.32571137,
         1.45634848,   1.59985872,   1.75751062,   1.93069773,
         2.12095089,   2.32995181,   2.55954792,   2.8117687 ,
         3.0888436 ,   3.39322177,   3.72759372,   4.09491506,
         4.49843267,   4...2,
         6.55128557,   7.19685673,   7.90604321,   8.68511374,
         9.54095476,  10.48113134,  11.51395399,  12.64855217,
        13.89495494,  15.26417967,  16.76832937,  18.42069969,
        20.23589648,  22.22996483,  24.42053095,  26.82695795,
        29.47051703,  32.37457543,  35.56480306,  39.06939937,
        42.9193426 ,  47.14866363,  51.79474679,  56.89866029,
    

In [71]:
pred1=search1.best_estimator_.predict(X_test1)

In [72]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test1,pred1))}")

The root mean square error is 0.7503332592921628


In [73]:
trained1 = fit_all([lsv1,sv1,lg1,mnb1,xgb1,rf1],X_train1,y_train1)

In [75]:
pred1 = search.best_estimator_.predict(X_test1)

In [76]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test,lg2_pred))}")

The root mean square error is 0.6902897942168926


In [77]:
##making predictions for each trained model

##LinearSV
lsvc1_pred= trained1[0].predict(X_test1)
##SVC
svc1_pred= trained1[1].predict(X_test1)
##Logistic
lg1_pred= trained1[2].predict(X_test1)
##MultinomialNB
mnb1_pred= trained1[3].predict(X_test1)
##Xgboost
xgb1_pred = trained1[4].predict(X_test1)
##Random forest
rf1_pred = trained1[5].predict(X_test1)

In [78]:
print('LinearSV')
print(np.sqrt(mean_squared_error(y_test1,lsvc1_pred)))
print('\n\n')
print('SVC')
print(np.sqrt(mean_squared_error(y_test1,svc1_pred)))
print('\n\n')
print('LOgistc')
print(np.sqrt(mean_squared_error(y_test1,lg1_pred)))
print('\n\n')
print('Multinomial')
print(np.sqrt(mean_squared_error(y_test1,mnb1_pred)))
print('\n\n')
print('Xgboot')
print(np.sqrt(mean_squared_error(y_test1,xgb1_pred)))
print('\n\n')
print('Random forest')
print(np.sqrt(mean_squared_error(y_test1,rf1_pred)))

LinearSV
0.6928203230275509



SVC
0.7060453243241541



LOgistc
0.6855654600401044



Multinomial
0.7439758060582347



Xgboot
0.722841614740048



Random forest
0.7300684899377592


In [79]:
##preparing the test set
df2 =df_test.copy()

In [80]:
##df2.drop('label',axis=1,inplace=True)

In [81]:
df2.head()

Unnamed: 0,tweet_id,safe_text,label
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,-1.0
1,00UNMD0E,Students starting school without whooping coug...,1.0
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...",0.0
3,01HOEQJW,How many innocent children die for lack of vac...,1.0
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...",0.0


In [82]:
##remove punc
df2['no_punc']=df2['safe_text'].apply(remove_punc)

In [83]:
df2.head()

Unnamed: 0,tweet_id,safe_text,label,no_punc
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,-1.0,user user amp 4 a vaccine given 2 healthy peep...
1,00UNMD0E,Students starting school without whooping coug...,1.0,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...",0.0,I m kinda over every ep of user being ripped f...
3,01HOEQJW,How many innocent children die for lack of vac...,1.0,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...",0.0,CDC eyeing bird flu vaccine for humans though ...


In [84]:
df2['lower_case']=df2['no_punc'].apply(lower_case)

In [85]:
df2.head()

Unnamed: 0,tweet_id,safe_text,label,no_punc,lower_case
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,-1.0,user user amp 4 a vaccine given 2 healthy peep...,user user amp 4 a vaccine given 2 healthy peep...
1,00UNMD0E,Students starting school without whooping coug...,1.0,Students starting school without whooping coug...,students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...",0.0,I m kinda over every ep of user being ripped f...,i m kinda over every ep of user being ripped f...
3,01HOEQJW,How many innocent children die for lack of vac...,1.0,How many innocent children die for lack of vac...,how many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...",0.0,CDC eyeing bird flu vaccine for humans though ...,cdc eyeing bird flu vaccine for humans though ...


In [86]:
df2['no_numbers']=df2['lower_case'].apply(remove_numbers)

In [87]:
df2.head()

Unnamed: 0,tweet_id,safe_text,label,no_punc,lower_case,no_numbers
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,-1.0,user user amp 4 a vaccine given 2 healthy peep...,user user amp 4 a vaccine given 2 healthy peep...,user user amp a vaccine given healthy peeps ...
1,00UNMD0E,Students starting school without whooping coug...,1.0,Students starting school without whooping coug...,students starting school without whooping coug...,students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...",0.0,I m kinda over every ep of user being ripped f...,i m kinda over every ep of user being ripped f...,i m kinda over every ep of user being ripped f...
3,01HOEQJW,How many innocent children die for lack of vac...,1.0,How many innocent children die for lack of vac...,how many innocent children die for lack of vac...,how many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...",0.0,CDC eyeing bird flu vaccine for humans though ...,cdc eyeing bird flu vaccine for humans though ...,cdc eyeing bird flu vaccine for humans though ...


In [88]:
X_val1 = df2['no_numbers']

In [89]:
##df2['label']=lg1.predict(X_val)

In [90]:
submission2 = df2[['tweet_id','label']]

In [91]:
submission2.head()

Unnamed: 0,tweet_id,label
0,00BHHHP1,-1.0
1,00UNMD0E,1.0
2,01AXPTJF,0.0
3,01HOEQJW,1.0
4,01JUKMAO,0.0


In [92]:
submission2.to_csv('submission2.csv',index=False)

In [93]:
df2['label']=lsv1.predict(X_val)

In [94]:
submission3 = df2[['tweet_id','label']]

In [95]:
submission3.to_csv('submission3.csv',index=False)

In [96]:
################### working on submision 3 #################################################
##romoving stopwords

    

In [97]:
stop=stopwords.words('english')

In [98]:
def remove_stop(string1):
    totenizer=RegexpTokenizer(r'\w+')
    
    return ' '.join(i for i in totenizer.tokenize(string1) if i not in stop)

In [99]:
remove_stop('i love playing with weights')

'love playing weights'

In [100]:
df1['no_stop'] = df1['no_numbers'].apply(remove_stop)

In [101]:
X2 = df1['no_stop']
y2 = df1['label']

In [102]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.2, random_state=42)

In [103]:
lsv2 = Pipeline([('vec',TfidfVectorizer()),('clf',LinearSVC())])
sv2 = Pipeline([('vec',TfidfVectorizer()),('clf',SVC())])
lg2 = Pipeline([('vec',TfidfVectorizer()),('clf',LogisticRegression(max_iter=1000))])
mnb2 = Pipeline([('vec',TfidfVectorizer()),('clf',MultinomialNB())])
xgb2 = Pipeline([('vec',TfidfVectorizer()),('clf',XGBClassifier())])
rf2 = Pipeline([('vec',TfidfVectorizer()),('clf',RandomForestClassifier())])

In [104]:
param_grid = {
    'clf__C': np.logspace(0, 2)
}
search2 = GridSearchCV(lg, param_grid, n_jobs=-1)
search2.fit(X_train2, y_train2)

GridSearchCV(estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('clf',
                                        LogisticRegression(class_weight='balanced',
                                                           max_iter=10000))]),
             n_jobs=-1,
             param_grid={'clf__C': array([  1.        ,   1.09854114,   1.20679264,   1.32571137,
         1.45634848,   1.59985872,   1.75751062,   1.93069773,
         2.12095089,   2.32995181,   2.55954792,   2.8117687 ,
         3.0888436 ,   3.39322177,   3.72759372,   4.09491506,
         4.49843267,   4...2,
         6.55128557,   7.19685673,   7.90604321,   8.68511374,
         9.54095476,  10.48113134,  11.51395399,  12.64855217,
        13.89495494,  15.26417967,  16.76832937,  18.42069969,
        20.23589648,  22.22996483,  24.42053095,  26.82695795,
        29.47051703,  32.37457543,  35.56480306,  39.06939937,
        42.9193426 ,  47.14866363,  51.79474679,  56.89866029,
    

In [105]:
pred2=search2.best_estimator_.predict(X_test2)

In [106]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test2,pred2))}")

The root mean square error is 0.7526619427073485


In [107]:
trained2 = fit_all([lsv2,sv2,lg2,mnb2,xgb2,rf2],X_train2,y_train2)

In [108]:
##LinearSV
lsvc2_pred= trained2[0].predict(X_test2)
##SVC
svc2_pred= trained2[1].predict(X_test2)
##Logistic
lg2_pred= trained2[2].predict(X_test2)
##MultinomialNB
mnb2_pred= trained2[3].predict(X_test2)
##Xgboost
xgb2_pred = trained2[4].predict(X_test2)
##Random forest
rf2_pred = trained2[5].predict(X_test2)

In [109]:
print('LinearSV')
print(np.sqrt(mean_squared_error(y_test2,lsvc2_pred)))
print('\n\n')
print('SVC')
print(np.sqrt(mean_squared_error(y_test2,svc2_pred)))
print('\n\n')
print('LOgistc')
print(np.sqrt(mean_squared_error(y_test2,lg2_pred)))
print('\n\n')
print('Multinomial')
print(np.sqrt(mean_squared_error(y_test2,mnb2_pred)))
print('\n\n')
print('Xgboot')
print(np.sqrt(mean_squared_error(y_test2,xgb2_pred)))
print('\n\n')
print('Random forest')
print(np.sqrt(mean_squared_error(y_test2,rf2_pred)))

LinearSV
0.6931810730249348



SVC
0.698212002188447



LOgistc
0.6989277502002621



Multinomial
0.7402702209328699



Xgboot
0.7099295739719539



Random forest
0.7276675064890558


In [110]:
from nltk.stem import WordNetLemmatizer
##lemalization
def lemmatise(string1):
    lem = WordNetLemmatizer()
    totenizer=RegexpTokenizer(r'\w+')
    
    return ' '.join(lem.lemmatize(i) for i in totenizer.tokenize(string1))

In [111]:
lemmatise('i am currently doing my degrees in computer sciences')

'i am currently doing my degree in computer science'

In [112]:
df1['lemmatized']=df1['no_stop'].apply(lemmatise)

In [113]:
df1.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,no_puct,lower_case,no_numbers,no_stop,lemmatized
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0,Me amp The Big Homie meanboy3000 MEANBOY MB MB...,me amp the big homie meanboy3000 meanboy mb mb...,me amp the big homie meanboy meanboy mb mbs mm...,amp big homie meanboy meanboy mb mbs mmr stegm...,amp big homie meanboy meanboy mb mb mmr stegma...
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0,I m 100 thinking of devoting my career to prov...,i m 100 thinking of devoting my career to prov...,i m thinking of devoting my career to proving...,thinking devoting career proving autism caused...,thinking devoting career proving autism caused...
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0,whatcausesautism VACCINES DO NOT VACCINATE YOU...,whatcausesautism vaccines do not vaccinate you...,whatcausesautism vaccines do not vaccinate you...,whatcausesautism vaccines vaccinate child,whatcausesautism vaccine vaccinate child
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0,I mean if they immunize my kid with something ...,i mean if they immunize my kid with something ...,i mean if they immunize my kid with something ...,mean immunize kid something secretly kill year...,mean immunize kid something secretly kill year...
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0,Thanks to user Catch me performing at La Nuit ...,thanks to user catch me performing at la nuit ...,thanks to user catch me performing at la nuit ...,thanks user catch performing la nuit nyc st av...,thanks user catch performing la nuit nyc st av...


In [114]:
X3 = df1['lemmatized']
y3 = df1['label']

In [115]:
X_train3, X_test3, y_train3, y_test3 = train_test_split(X3, y3, test_size=0.2, random_state=42)

In [116]:
lsv3 = Pipeline([('vec',TfidfVectorizer()),('clf',LinearSVC())])
sv3 = Pipeline([('vec',TfidfVectorizer()),('clf',SVC())])
lg3 = Pipeline([('vec',TfidfVectorizer()),('clf',LogisticRegression(max_iter=1000))])
mnb3 = Pipeline([('vec',TfidfVectorizer()),('clf',MultinomialNB())])
xgb3 = Pipeline([('vec',TfidfVectorizer()),('clf',XGBClassifier())])
rf3 = Pipeline([('vec',TfidfVectorizer()),('clf',RandomForestClassifier())])

In [117]:
param_grid = {
    'clf__C': np.logspace(0, 2)
}
search3 = GridSearchCV(lg, param_grid, n_jobs=-1)
search3.fit(X_train3, y_train3)

GridSearchCV(estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('clf',
                                        LogisticRegression(class_weight='balanced',
                                                           max_iter=10000))]),
             n_jobs=-1,
             param_grid={'clf__C': array([  1.        ,   1.09854114,   1.20679264,   1.32571137,
         1.45634848,   1.59985872,   1.75751062,   1.93069773,
         2.12095089,   2.32995181,   2.55954792,   2.8117687 ,
         3.0888436 ,   3.39322177,   3.72759372,   4.09491506,
         4.49843267,   4...2,
         6.55128557,   7.19685673,   7.90604321,   8.68511374,
         9.54095476,  10.48113134,  11.51395399,  12.64855217,
        13.89495494,  15.26417967,  16.76832937,  18.42069969,
        20.23589648,  22.22996483,  24.42053095,  26.82695795,
        29.47051703,  32.37457543,  35.56480306,  39.06939937,
        42.9193426 ,  47.14866363,  51.79474679,  56.89866029,
    

In [118]:
pred3=search3.best_estimator_.predict(X_test3)

In [119]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test3,pred3))}")

The root mean square error is 0.746659226153404


In [120]:
trained3 = fit_all([lsv3,sv3,lg3,mnb3,xgb3,rf3],X_train3,y_train3)

In [121]:
##LinearSV
lsvc3_pred= trained3[0].predict(X_test3)
##SVC
svc3_pred= trained3[1].predict(X_test3)
##Logistic
lg3_pred= trained3[2].predict(X_test3)
##MultinomialNB
mnb3_pred= trained3[3].predict(X_test3)
##Xgboost
xgb3_pred = trained3[4].predict(X_test3)
##Random forest
rf3_pred = trained3[5].predict(X_test3)

In [122]:
print('LinearSV')
print(np.sqrt(mean_squared_error(y_test3,lsvc3_pred)))
print('\n\n')
print('SVC')
print(np.sqrt(mean_squared_error(y_test3,svc3_pred)))
print('\n\n')
print('LOgistc')
print(np.sqrt(mean_squared_error(y_test3,lg3_pred)))
print('\n\n')
print('Multinomial')
print(np.sqrt(mean_squared_error(y_test3,mnb3_pred)))
print('\n\n')
print('Xgboot')
print(np.sqrt(mean_squared_error(y_test3,xgb3_pred)))
print('\n\n')
print('Random forest')
print(np.sqrt(mean_squared_error(y_test3,rf3_pred)))

LinearSV
0.6931810730249348



SVC
0.6844705983459042



LOgistc
0.6953416426476987



Multinomial
0.73824115301167



Xgboot
0.7231873892705818



Random forest
0.7224956747275377


In [123]:
##preparing the test set
df2.head()

Unnamed: 0,tweet_id,safe_text,label,no_punc,lower_case,no_numbers
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,-1.0,user user amp 4 a vaccine given 2 healthy peep...,user user amp 4 a vaccine given 2 healthy peep...,user user amp a vaccine given healthy peeps ...
1,00UNMD0E,Students starting school without whooping coug...,0.0,Students starting school without whooping coug...,students starting school without whooping coug...,students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...",0.0,I m kinda over every ep of user being ripped f...,i m kinda over every ep of user being ripped f...,i m kinda over every ep of user being ripped f...
3,01HOEQJW,How many innocent children die for lack of vac...,1.0,How many innocent children die for lack of vac...,how many innocent children die for lack of vac...,how many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...",0.0,CDC eyeing bird flu vaccine for humans though ...,cdc eyeing bird flu vaccine for humans though ...,cdc eyeing bird flu vaccine for humans though ...


In [124]:
df2['no_stop'] = df2['no_numbers'].apply(remove_stop)

In [125]:
df2['lemmatized'] = df2['no_stop'].apply(lemmatise)

In [126]:
X_val3 = df2['lemmatized']

In [127]:
df2['label']=search3.best_estimator_.predict(X_val3)

In [128]:
df2.head()

Unnamed: 0,tweet_id,safe_text,label,no_punc,lower_case,no_numbers,no_stop,lemmatized
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...,-1.0,user user amp 4 a vaccine given 2 healthy peep...,user user amp 4 a vaccine given 2 healthy peep...,user user amp a vaccine given healthy peeps ...,user user amp vaccine given healthy peeps fda ...,user user amp vaccine given healthy peep fda t...
1,00UNMD0E,Students starting school without whooping coug...,1.0,Students starting school without whooping coug...,students starting school without whooping coug...,students starting school without whooping coug...,students starting school without whooping coug...,student starting school without whooping cough...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe...",0.0,I m kinda over every ep of user being ripped f...,i m kinda over every ep of user being ripped f...,i m kinda over every ep of user being ripped f...,kinda every ep user ripped headlines measles l...,kinda every ep user ripped headline measles le...
3,01HOEQJW,How many innocent children die for lack of vac...,1.0,How many innocent children die for lack of vac...,how many innocent children die for lack of vac...,how many innocent children die for lack of vac...,many innocent children die lack vaccination ye...,many innocent child die lack vaccination year ...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though...",0.0,CDC eyeing bird flu vaccine for humans though ...,cdc eyeing bird flu vaccine for humans though ...,cdc eyeing bird flu vaccine for humans though ...,cdc eyeing bird flu vaccine humans though risk...,cdc eyeing bird flu vaccine human though risk ...


In [129]:
submission6 = df2[['tweet_id','label']]

In [130]:
submission6.to_csv('submission6.csv',index=False)

In [131]:
##################################################### Random forest ############################################################


In [132]:
rfs= Pipeline([('vec',TfidfVectorizer()),('clf',RandomForestClassifier())])

In [133]:
param_grid = {
    'clf__n_estimators': range(20,1000,20)
}
searchr1 = GridSearchCV(rfs, param_grid, n_jobs=-1)
searchr1.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('clf', RandomForestClassifier())]),
             n_jobs=-1, param_grid={'clf__n_estimators': range(20, 1000, 20)})

In [134]:
predr1 = searchr1.best_estimator_.predict(X_test)

In [135]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test,predr1))}")

The root mean square error is 0.726636084983398


In [136]:
#############################################################################################################################

In [137]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [138]:
knn= Pipeline([('vec',TfidfVectorizer()),('clf',KNeighborsClassifier())])

In [139]:
knn.fit(X_train,y_train)

Pipeline(steps=[('vec', TfidfVectorizer()), ('clf', KNeighborsClassifier())])

In [140]:
predKnn = knn.predict(X_test)

In [141]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test,predKnn))}")

The root mean square error is 0.7249137879775773


In [142]:
param_grid = {
    'clf__n_neighbors':range(1,20)
}
searchknn= GridSearchCV(knn, param_grid, n_jobs=-1)
searchknn.fit(X_train, y_train)

GridSearchCV(estimator=Pipeline(steps=[('vec', TfidfVectorizer()),
                                       ('clf', KNeighborsClassifier())]),
             n_jobs=-1, param_grid={'clf__n_neighbors': range(1, 20)})

In [143]:
predKnn1 =searchknn.best_estimator_.predict(X_test)
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test,predKnn1))}")

The root mean square error is 0.7039176088151227


In [144]:
##decisionTree
dt= Pipeline([('vec',TfidfVectorizer()),('clf',DecisionTreeClassifier())])

In [145]:
dt.fit(X_train,y_train)

Pipeline(steps=[('vec', TfidfVectorizer()), ('clf', DecisionTreeClassifier())])

In [146]:
preddt = dt.predict(X_test)

In [147]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test,preddt))}")

The root mean square error is 0.7861297602813418


In [148]:
###nueral netwoks
from sklearn.neural_network import MLPClassifier

In [149]:
nn= Pipeline([('vec',TfidfVectorizer()),('clf',MLPClassifier())])

In [150]:
nn.fit(X_train2,y_train2)

Pipeline(steps=[('vec', TfidfVectorizer()), ('clf', MLPClassifier())])

In [151]:
prednn = nn.predict(X_test2)

In [152]:
print(f"The root mean square error is {np.sqrt(mean_squared_error(y_test,prednn))}")

The root mean square error is 0.7784600182411425
