In [55]:
#Load the dataset
import pandas as pd

train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0


In [56]:
for index, tweet in enumerate(train["safe_text"][10:15]):
    print(index+1,".",tweet)

1 . <user> @ this point I have 2 text, butw/Bon Jovi cover playin @ Alibi's hope U can come out 2 MMR BBQ<user> will b there!
2 . My prediction, vaccine exemption in Arizona will end soon. To much money is being lost by big pharma.
3 . Getting my vaccines ! #china #nervous #moving .. With Cheryl (@ Southern Nevada Health District) <url>
4 . 1$Mug Noche <user> #mmr #mixmasterrod #dcdj #mmr   @ Mad Hatter <url>
5 . Got my Influenza Vaccine! (@ Purdue University Student Health (PUSH) - <user> <url>


**Note :- Noise present in Tweets**

 - If you look closely, you'll see that there are many hashtags present in the tweets of the form `#` symbol followed by text. We particularly don't need the `#` symbol so we will clean it out.
 - Also, there are strange symbols like `&` and `@` in tweet 4. This is actually `unicode` characters that is present in our dataset that we need to get rid of because they don't particularly add anything meaningful.
 - There are also numerals and percentages .

### 2. Data Cleaning

In [57]:
import re

#Clean text from noise
def clean_text(text):
    #Filter to allow only alphabets
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    
    #Remove Unicode characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    #Convert to lowercase to maintain consistency
    text = text.lower()
       
    return text

In [58]:
train['clean_text'] = train.safe_text.apply(lambda x: clean_text(x))

In [59]:
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,<user> <user> ... &amp; 4 a vaccine given 2 he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,"I'm kinda over every ep of <user> being ""rippe..."
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,"CDC eyeing bird flu vaccine for humans, though..."


In [60]:
#Clean text from noise the intergers and floats
def cleaning_text(text):
    #Filter to allow only alphabets
    text = re.sub("(\d*\.\d+)|(\d+\.[0-9 ]+)","",text)
    
    #Convert to lowercase to maintain consistency
    text = text.lower()
       
    return text

In [61]:
test['safe_text'] = test['safe_text'].str.replace("[^a-zA-Z]", " ")


In [62]:
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,user user amp a vaccine given he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,I m kinda over every ep of user being rippe...
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,CDC eyeing bird flu vaccine for humans though...


In [63]:
train['clean_text'] = train.safe_text.apply(lambda x: cleaning_text(x))

#                                    <bold>Feature Engineering</bold>
Feature engineering is the science (and art) of extracting more information from existing data. You are not adding any new data here, but you are actually making the data you already have more useful.
The machine learning model does not understand text directly, so we create numerical features that reperesant the underlying text.
In this module, you'll deal with very basic NLP based features and as you progress further in the course you'll come across more complex and efficient ways of doing the same.

In [64]:
#Exhaustive list of stopwords in the english language. We want to focus less on these so at some point will have to filter
STOP_WORDS = ['a', 'about', 'above', 'after', 'again', 'against', 'all', 'also', 'am', 'an', 'and',
              'any', 'are', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below',
              'between', 'both', 'but', 'by', 'can', "can't", 'cannot', 'com', 'could', "couldn't", 'did',
              "didn't", 'do', 'does', "doesn't", 'doing', "don't", 'down', 'during', 'each', 'else', 'ever',
              'few', 'for', 'from', 'further', 'get', 'had', "hadn't", 'has', "hasn't", 'have', "haven't", 'having',
              'he', "he'd", "he'll", "he's", 'her', 'here', "here's", 'hers', 'herself', 'him', 'himself', 'his', 'how',
              "how's", 'however', 'http', 'i', "i'd", "i'll", "i'm", "i've", 'if', 'in', 'into', 'is', "isn't", 'it',
              "it's", 'its', 'itself', 'just', 'k', "let's", 'like', 'me', 'more', 'most', "mustn't", 'my', 'myself',
              'no', 'nor', 'not', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'otherwise', 'ought', 'our', 'ours',
              'ourselves', 'out', 'over', 'own', 'r', 'same', 'shall', "shan't", 'she', "she'd", "she'll", "she's",
              'should', "shouldn't", 'since', 'so', 'some', 'such', 'than', 'that', "that's", 'the', 'their', 'theirs',
              'them', 'themselves', 'then', 'there', "there's", 'these', 'they', "they'd", "they'll", "they're",
              "they've", 'this', 'those', 'through', 'to', 'too', 'under', 'until', 'up', 'very', 'was', "wasn't",
              'we', "we'd", "we'll", "we're", "we've", 'were', "weren't", 'what', "what's", 'when', "when's", 'where',
              "where's", 'which', 'while', 'who', "who's", 'whom', 'why', "why's", 'with', "won't", 'would', "wouldn't",
              'www', 'you', "you'd", "you'll", "you're", "you've", 'your', 'yours', 'yourself', 'yourselves']

#Generate word frequency
def gen_freq(text):
    #Will store the list of words
    word_list = []

    #Loop over all the tweets and extract words into word_list
    for tw_words in text.split():
        word_list.extend(tw_words)

    #Create word frequencies using word_list
    word_freq = pd.Series(word_list).value_counts()
    
    #Drop the stopwords during the frequency calculation
    word_freq = word_freq.drop(STOP_WORDS, errors='ignore')
    
    return word_freq

#Check whether a negation term is present in the text
def any_neg(words):
    for word in words:
        if word in ['n', 'no', 'non', 'not'] or re.search(r"\wn't", word):
            return 1
    else:
        return 0

#Check whether one of the 100 rare words is present in the text
def any_rare(words, rare_100):
    for word in words:
        if word in rare_100:
            return 1
    else:
        return 0

#Check whether prompt words are present
def is_question(words):
    for word in words:
        if word in ['when', 'what', 'how', 'why', 'who']:
            return 1
    else:
        return 0

In [65]:
#training dataset
word_freq = gen_freq(train.clean_text.str)
#100 most rare words in the dataset
rare_100 = word_freq[-100:]
#Number of words in a tweet
train['word_count'] = train.clean_text.str.split().apply(lambda x: len(x))
#Negation present or not
train['any_neg'] = train.clean_text.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
train['is_question'] = train.clean_text.str.split().apply(lambda x: is_question(x))
#Any of the most 100 rare words present or not
train['any_rare'] = train.clean_text.str.split().apply(lambda x: any_rare(x, rare_100))
#Character count of the tweet
train['char_count'] = train.clean_text.apply(lambda x: len(x))

In [66]:
train.head()

Unnamed: 0,tweet_id,safe_text,label,agreement,clean_text,word_count,any_neg,is_question,any_rare,char_count
0,CL1KWCMY,Me &amp; The Big Homie meanboy3000 #MEANBOY #M...,0.0,1.0,me &amp; the big homie meanboy3000 #meanboy #m...,15,0,0,0,90
1,E3303EME,I'm 100% thinking of devoting my career to pro...,1.0,1.0,i'm 100% thinking of devoting my career to pro...,25,1,0,0,140
2,M4IVFSMS,"#whatcausesautism VACCINES, DO NOT VACCINATE Y...",-1.0,1.0,"#whatcausesautism vaccines, do not vaccinate y...",7,1,0,0,55
3,1DR6ROZ4,I mean if they immunize my kid with something ...,-1.0,1.0,i mean if they immunize my kid with something ...,28,1,0,0,138
4,J77ENIIE,Thanks to <user> Catch me performing at La Nui...,0.0,1.0,thanks to <user> catch me performing at la nui...,20,0,0,0,106


In [67]:
train['label'] = train.label.fillna(-1)

In [68]:
train.isna().sum()

tweet_id       0
safe_text      0
label          0
agreement      2
clean_text     0
word_count     0
any_neg        0
is_question    0
any_rare       0
char_count     0
dtype: int64

In [69]:
X = train[['word_count', 'any_neg', 'any_rare', 'char_count', 'is_question']]
y = train.label

In [70]:
test.head()

Unnamed: 0,tweet_id,safe_text
0,00BHHHP1,user user amp a vaccine given he...
1,00UNMD0E,Students starting school without whooping coug...
2,01AXPTJF,I m kinda over every ep of user being rippe...
3,01HOEQJW,How many innocent children die for lack of vac...
4,01JUKMAO,CDC eyeing bird flu vaccine for humans though...


In [71]:
test['safe_text'] = test['safe_text'].fillna('xxxxxx')

In [72]:
#testing dataset
word_freq = gen_freq(test.safe_text.str)
#100 most rare words in the dataset
rare_100 = word_freq[-100:]
#Number of words in a tweet
test['word_count'] = test.safe_text.str.split().apply(lambda x: len(x))
#Negation present or not
test['any_neg'] = test.safe_text.str.split().apply(lambda x: any_neg(x))
#Prompt present or not
test['is_question'] = test.safe_text.str.split().apply(lambda x: is_question(x))
#Any of the most 100 rare words present or not
test['any_rare'] = test.safe_text.str.split().apply(lambda x: any_rare(x, rare_100))
#Character count of the tweet
test['char_count'] = test.safe_text.apply(lambda x: len(x))

In [73]:
test.head()

Unnamed: 0,tweet_id,safe_text,word_count,any_neg,is_question,any_rare,char_count
0,00BHHHP1,user user amp a vaccine given he...,17,1,0,0,110
1,00UNMD0E,Students starting school without whooping coug...,9,0,0,0,74
2,01AXPTJF,I m kinda over every ep of user being rippe...,21,0,0,0,107
3,01HOEQJW,How many innocent children die for lack of vac...,23,0,0,0,136
4,01JUKMAO,CDC eyeing bird flu vaccine for humans though...,20,0,0,0,120


In [74]:
test= test[['word_count', 'any_neg', 'any_rare', 'char_count', 'is_question']]

In [75]:
from sklearn.naive_bayes import GaussianNB

#Initialize GaussianNB classifier
model = GaussianNB()
#Fit the model on the train dataset
model = model.fit(X, y)

In [76]:
#Make predictions on the test dataset
pred = model.predict(test)

In [77]:
SampleSubmission = pd.read_csv('SampleSubmission.csv')

In [78]:
SampleSubmission = SampleSubmission.drop(['label'], axis=1)

In [79]:
SampleSubmission['label'] = pred

In [80]:
SampleSubmission.head()

Unnamed: 0,tweet_id,label
0,00BHHHP1,1.0
1,00UNMD0E,0.0
2,01AXPTJF,0.0
3,01HOEQJW,0.0
4,01JUKMAO,0.0


In [81]:
SampleSubmission.to_csv('SampleSubmission.csv', index=False)

In [82]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [83]:
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

In [87]:
param_test = {'reg_alpha':[1e-5, 1e-2, 0.1, 100]}

gsearch = GridSearchCV(estimator = XGBClassifier(learning_rate =0.1,
                                                 n_estimators=10,
                                                 max_depth=5,
                                                 min_child_weight=2,
                                                 gamma=0.1,
                                                 subsample=0.85,
                                                 colsample_bytree=0.8,
                                                 objective= 'binary:logistic',
                                                 nthread=4,
                                                 scale_pos_weight=1,
                                                 seed=27), 
                       param_grid = param_test,
                       n_jobs=20,
                       iid=False, 
                       cv=2, verbose=10)

gsearch.fit(X,y)

print('Best Grid Search Parameters :',gsearch.best_params_)
print('Best Grid Search Score : ',gsearch.best_score_)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   2 out of   8 | elapsed:    1.0s remaining:    3.1s
[Parallel(n_jobs=20)]: Done   3 out of   8 | elapsed:    1.2s remaining:    2.1s
[Parallel(n_jobs=20)]: Done   4 out of   8 | elapsed:    1.2s remaining:    1.2s
[Parallel(n_jobs=20)]: Done   5 out of   8 | elapsed:    1.2s remaining:    0.7s
[Parallel(n_jobs=20)]: Done   6 out of   8 | elapsed:    1.2s remaining:    0.3s
[Parallel(n_jobs=20)]: Done   8 out of   8 | elapsed:   30.7s remaining:    0.0s
[Parallel(n_jobs=20)]: Done   8 out of   8 | elapsed:   30.7s finished


Best Grid Search Parameters : {'reg_alpha': 1e-05}
Best Grid Search Score :  0.5618431913617277


In [96]:
pred = gsearch.predict(test)

In [97]:
pred

array([1., 0., 0., ..., 0., 0., 0.])

In [98]:
SampleSubmission = pd.read_csv('SampleSubmission.csv')

In [100]:
SampleSubmission = SampleSubmission.drop(['label'], axis=1)

In [102]:
SampleSubmission['label'] = pred

In [103]:
SampleSubmission.head()

Unnamed: 0,tweet_id,label
0,00BHHHP1,1.0
1,00UNMD0E,0.0
2,01AXPTJF,0.0
3,01HOEQJW,1.0
4,01JUKMAO,0.0


In [104]:
SampleSubmission.to_csv('SampleSubmission.csv', index=False)