In [1]:
import re
import string
import nltk
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Load the labelled training tweets and the unlabelled test tweets.
# Both paths are relative, so the CSVs must sit in the notebook's working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
# Preview the training frame (columns: id, label, tweet).
train

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...
...,...,...,...
7915,7916,0,Live out loud #lol #liveoutloud #selfie #smile...
7916,7917,0,We would like to wish you an amazing day! Make...
7917,7918,0,Helping my lovely 90 year old neighbor with he...
7918,7919,0,Finally got my #smart #pocket #wifi stay conne...


In [4]:
# Preview the test frame (columns: id, tweet — there is no label column).
test

Unnamed: 0,id,tweet
0,7921,I hate the new #iphone upgrade. Won't let me d...
1,7922,currently shitting my fucking pants. #apple #i...
2,7923,"I'd like to puts some CD-ROMS on my iPad, is t..."
3,7924,My ipod is officially dead. I lost all my pict...
4,7925,Been fighting iTunes all night! I only want th...
...,...,...
1948,9869,"#SamsungGalaxyNote7 Explodes, Burns 6-Year-Old..."
1949,9870,Now Available - Hoodie. Check it out here - ht...
1950,9871,There goes a crack right across the screen. If...
1951,9872,@codeofinterest as i said #Adobe big time we m...


In [5]:
# (rows, columns) of the training frame.
train.shape

(7920, 3)

In [6]:
# (rows, columns) of the test frame.
test.shape

(1953, 2)

In [7]:
# Fetch the NLTK resources used by the preprocessing cells:
# 'stopwords' is read via stopwords.words('english'); 'punkt' is presumably
# what word_tokenize relies on — confirm for the installed NLTK version.
# NOTE(review): 'wordnet' is downloaded, but no visible cell uses a
# WordNet-based tool (stemming is done with SnowballStemmer) — confirm it is needed.
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Stack train and test so both go through identical text preprocessing.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and yields the same frame (test rows get NaN labels).
# The original row labels are kept on purpose (no ignore_index), so labels
# 0..1952 occur twice; the frame is later split back by position, not by label.
merged = pd.concat([train, test])
merged

Unnamed: 0,id,label,tweet
0,1,0.0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0.0,Finally a transparant silicon case ^^ Thanks t...
2,3,0.0,We love this! Would you go? #talk #makememorie...
3,4,0.0,I'm wired I know I'm George I was made that wa...
4,5,1.0,What amazing service! Apple won't even talk to...
...,...,...,...
1948,9869,,"#SamsungGalaxyNote7 Explodes, Burns 6-Year-Old..."
1949,9870,,Now Available - Hoodie. Check it out here - ht...
1950,9871,,There goes a crack right across the screen. If...
1951,9872,,@codeofinterest as i said #Adobe big time we m...


## DATA PREPROCESSING

In [9]:
# Pull out the raw tweet text of the combined train + test frame.
reviews = merged['tweet']
reviews

0       #fingerprint #Pregnancy Test https://goo.gl/h1...
1       Finally a transparant silicon case ^^ Thanks t...
2       We love this! Would you go? #talk #makememorie...
3       I'm wired I know I'm George I was made that wa...
4       What amazing service! Apple won't even talk to...
                              ...                        
1948    #SamsungGalaxyNote7 Explodes, Burns 6-Year-Old...
1949    Now Available - Hoodie. Check it out here - ht...
1950    There goes a crack right across the screen. If...
1951    @codeofinterest as i said #Adobe big time we m...
1952    Finally I got it .. thanx my father .. #Samsun...
Name: tweet, Length: 9873, dtype: object

In [10]:
# Collapse every run of non-letter characters (digits, punctuation, '#', '@',
# URL symbols, ...) into a single space.
# regex=True must be explicit: from pandas 2.0 on, Series.str.replace treats
# the pattern as a literal string by default, which would leave the tweets
# untouched and silently skip this cleaning step.
reviews = reviews.str.replace("[^a-zA-Z]+", " ", regex=True)
reviews

0        fingerprint Pregnancy Test https goo gl h MfQ...
1       Finally a transparant silicon case Thanks to m...
2       We love this Would you go talk makememories un...
3       I m wired I know I m George I was made that wa...
4       What amazing service Apple won t even talk to ...
                              ...                        
1948     SamsungGalaxyNote Explodes Burns Year Old Tha...
1949    Now Available Hoodie Check it out here http ze...
1950    There goes a crack right across the screen If ...
1951     codeofinterest as i said Adobe big time we ma...
1952    Finally I got it thanx my father Samsung galax...
Name: tweet, Length: 9873, dtype: object

In [16]:
# Tokenize each tweet with NLTK's TweetTokenizer, then join the tokens back
# with single spaces so `reviews` remains a Series of plain strings.

from nltk import TweetTokenizer
from nltk.tokenize import word_tokenize
tk = TweetTokenizer()
reviews = reviews.apply(lambda tweet: ' '.join(tk.tokenize(tweet)))

In [17]:
# Inspect the re-joined tweets.
# NOTE(review): the stored output of this cell is already lowercased and stemmed,
# which the cells before it do not do, and the execution counts jump from 10 to 16 —
# the notebook was probably run out of order; restart the kernel and run all cells.
reviews

0       fingerprint pregnanc test https goo gl h mfqv ...
1       final a transpar silicon case thank to my uncl...
2       we love this would you go talk makememori unpl...
3       i m wire i know i m georg i was made that way ...
4       what amaz servic appl won t even talk to me ab...
                              ...                        
1948    samsunggalaxynot explod burn year old thank fo...
1949    now avail hoodi check it out here http zetasup...
1950    there goe a crack right across the screen if y...
1951    codeofinterest as i said adob big time we may ...
1952    final i got it thanx my father samsung galaxi ...
Name: tweet, Length: 9873, dtype: object

In [18]:
# Stemming: lowercase every token and reduce it to its Snowball (English) stem,
# then re-join the stems into one space-separated string per tweet.

from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer('english')
reviews = reviews.apply(
    lambda tweet: ' '.join(stemmer.stem(token.lower()) for token in tk.tokenize(tweet))
)

In [19]:
# Inspect the stemmed tweets.
reviews

0       fingerprint pregnanc test https goo gl h mfqv ...
1       final a transpar silicon case thank to my uncl...
2       we love this would you go talk makememori unpl...
3       i m wire i know i m georg i was made that way ...
4       what amaz servic appl won t even talk to me ab...
                              ...                        
1948    samsunggalaxynot explod burn year old thank fo...
1949    now avail hoodi check it out here http zetasup...
1950    there goe a crack right across the screen if y...
1951    codeofinterest as i said adob big time we may ...
1952    final i got it thanx my father samsung galaxi ...
Name: tweet, Length: 9873, dtype: object

In [21]:
# Removing stop words
# NOTE(review): this runs after stemming, so stop words whose stem differs from
# the dictionary form are presumably not matched — confirm, and consider
# filtering before the stemming cell instead.

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
stop = stopwords.words('english')  # stays a list: it is passed to TfidfVectorizer later on
stop_set = set(stop)               # set membership is O(1) per token instead of a list scan
reviews = reviews.apply(
    lambda tweet: ' '.join(token for token in word_tokenize(tweet) if token not in stop_set)
)

In [22]:
# Inspect the tweets after stop-word removal.
reviews

0       fingerprint pregnanc test https goo gl h mfqv ...
1       final transpar silicon case thank uncl yay son...
2       love would go talk makememori unplug relax iph...
3       wire know georg made way iphon cute daventri h...
4       amaz servic appl even talk question unless pay...
                              ...                        
1948    samsunggalaxynot explod burn year old thank ru...
1949    avail hoodi check http zetasuppli co uk produc...
1950    goe crack right across screen could actual pro...
1951    codeofinterest said adob big time may well inc...
1952    final got thanx father samsung galaxi gift fat...
Name: tweet, Length: 9873, dtype: object

In [23]:
# Preprocessed merged data: overwrite the raw tweet column with the cleaned text.
# This mutates `merged` in place, so earlier displays of `merged` are now stale.

merged['tweet'] = reviews
merged

Unnamed: 0,id,label,tweet
0,1,0.0,fingerprint pregnanc test https goo gl h mfqv ...
1,2,0.0,final transpar silicon case thank uncl yay son...
2,3,0.0,love would go talk makememori unplug relax iph...
3,4,0.0,wire know georg made way iphon cute daventri h...
4,5,1.0,amaz servic appl even talk question unless pay...
...,...,...,...
1948,9869,,samsunggalaxynot explod burn year old thank ru...
1949,9870,,avail hoodi check http zetasuppli co uk produc...
1950,9871,,goe crack right across screen could actual pro...
1951,9872,,codeofinterest said adob big time may well inc...


### Splitting merged preprocessed tweets back to train and test data

In [24]:
# The first len(train) rows of the stacked frame are the training rows
# (positional slice, because the row labels are not unique after stacking).
train_data = merged.iloc[:len(train)]

In [25]:
# Everything after the first len(train) rows is the test portion.
test_data = merged.iloc[len(train):]

In [26]:
# Preview the preprocessed training rows.
train_data

Unnamed: 0,id,label,tweet
0,1,0.0,fingerprint pregnanc test https goo gl h mfqv ...
1,2,0.0,final transpar silicon case thank uncl yay son...
2,3,0.0,love would go talk makememori unplug relax iph...
3,4,0.0,wire know georg made way iphon cute daventri h...
4,5,1.0,amaz servic appl even talk question unless pay...
...,...,...,...
7915,7916,0.0,live loud lol liveoutloud selfi smile soni mus...
7916,7917,0.0,would like wish amaz day make everi minut coun...
7917,7918,0.0,help love year old neighbor ipad morn made rea...
7918,7919,0.0,final got smart pocket wifi stay connect anyti...


In [27]:
# Preview the preprocessed test rows (label is NaN for all of them).
test_data

Unnamed: 0,id,label,tweet
0,7921,,hate new iphon upgrad let download app ugh app...
1,7922,,current shit fuck pant appl imac cashmoney rad...
2,7923,,like put cd rom ipad possibl yes block screen
3,7924,,ipod offici dead lost pictur video sos concert...
4,7925,,fight itun night want music paid
...,...,...,...
1948,9869,,samsunggalaxynot explod burn year old thank ru...
1949,9870,,avail hoodi check http zetasuppli co uk produc...
1950,9871,,goe crack right across screen could actual pro...
1951,9872,,codeofinterest said adob big time may well inc...


In [33]:
# Cleaned tweet text of the training rows — the vectoriser input.
train_review = train_data['tweet']
train_review

0       fingerprint pregnanc test https goo gl h mfqv ...
1       final transpar silicon case thank uncl yay son...
2       love would go talk makememori unplug relax iph...
3       wire know georg made way iphon cute daventri h...
4       amaz servic appl even talk question unless pay...
                              ...                        
7915    live loud lol liveoutloud selfi smile soni mus...
7916    would like wish amaz day make everi minut coun...
7917    help love year old neighbor ipad morn made rea...
7918    final got smart pocket wifi stay connect anyti...
7919    appl barcelona appl store bcn barcelona travel...
Name: tweet, Length: 7920, dtype: object

In [34]:
# Cleaned tweet text of the test rows.
test_review = test_data['tweet']
test_review

0       hate new iphon upgrad let download app ugh app...
1       current shit fuck pant appl imac cashmoney rad...
2           like put cd rom ipad possibl yes block screen
3       ipod offici dead lost pictur video sos concert...
4                        fight itun night want music paid
                              ...                        
1948    samsunggalaxynot explod burn year old thank ru...
1949    avail hoodi check http zetasuppli co uk produc...
1950    goe crack right across screen could actual pro...
1951    codeofinterest said adob big time may well inc...
1952    final got thanx father samsung galaxi gift fat...
Name: tweet, Length: 1953, dtype: object

## Vectorization

In [35]:
# TF-IDF vectoriser that turns the cleaned tweets into sparse feature rows.
# NOTE(review): the text was already stop-word filtered and stemmed above, so
# passing the unstemmed stop list again is presumably redundant — confirm.
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer(stop_words = stop)

In [36]:
# Learn the vocabulary / IDF weights from the training tweets and vectorise them.
# NOTE(review): the fit uses all labelled rows, but train_data_vec is split into
# training and hold-out parts only later (train_test_split), so the learned
# statistics also reflect the hold-out rows — consider fitting after the split.
train_data_vec = vec.fit_transform(train_review)

In [37]:
# Summary of the sparse training matrix (rows x vocabulary size).
train_data_vec

<7920x20198 sparse matrix of type '<class 'numpy.float64'>'
	with 109750 stored elements in Compressed Sparse Row format>

In [38]:
# Vectorise the test tweets with the already-fitted vectoriser (transform only,
# no refit), so test rows share the training feature columns.
test_data_vec = vec.transform(test_review)

In [39]:
# Summary of the sparse test matrix.
test_data_vec

<1953x20198 sparse matrix of type '<class 'numpy.float64'>'
	with 22970 stored elements in Compressed Sparse Row format>

In [40]:
# TAKING LABEL FROM train_data
# The labels are floats here because stacking with the unlabelled test rows
# introduced NaN into the label column.

y = train_data['label'].values
y

array([0., 0., 0., ..., 0., 0., 0.])

In [41]:
# COUNTING LABEL VALUES 0 AND 1 TO CHECK THE IMBALANCE OF DATA
# Wrap the label array in a Series to get per-class row counts.

pd.Series(y).value_counts()

0.0    5894
1.0    2026
dtype: int64

## Splitting the labelled data into training and validation sets

In [42]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for evaluation; stratify on the label so both
# parts keep the same class ratio, and fix the seed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    train_data_vec,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [43]:
# TO OVERCOME THE IMBALANCED DATA
# SMOTE is fitted on the training split only, so the hold-out rows stay untouched.
# random_state is fixed (same seed as the split) because SMOTE's sample
# generation is random; without it every run yields different data and metrics.

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
x_res, y_res = smote.fit_resample(x_train, y_train)

## RANDOM FOREST MODEL

In [44]:
from sklearn.ensemble import RandomForestClassifier

# Train on the SMOTE-balanced training data and predict the hold-out rows.
# random_state is fixed because the forest's bootstrapping and feature sampling
# are random; without a seed the reported scores and the submission change
# from run to run.
model1 = RandomForestClassifier(random_state=42)
model1.fit(x_res, y_res)
y_pred1 = model1.predict(x_test)

In [45]:
# Predicted labels for the hold-out rows.
y_pred1

array([1., 0., 0., ..., 0., 0., 1.])

In [46]:
# True labels of the hold-out rows, for comparison with the predictions.
y_test

array([1., 0., 0., ..., 0., 0., 0.])

In [48]:
# Per-class precision / recall / F1 on the hold-out rows.
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

         0.0       0.95      0.90      0.92      1179
         1.0       0.75      0.86      0.80       405

    accuracy                           0.89      1584
   macro avg       0.85      0.88      0.86      1584
weighted avg       0.90      0.89      0.89      1584



In [54]:
# Single F1 figure on the hold-out rows; the stored result agrees with the
# f1-score of class 1.0 in the classification report.
from sklearn.metrics import f1_score
f1_score(y_test, y_pred1)

0.7986191024165707

## Submission

In [60]:
# Re-check the vectorised test matrix before predicting on it.
test_data_vec

<1953x20198 sparse matrix of type '<class 'numpy.float64'>'
	with 22970 stored elements in Compressed Sparse Row format>

In [61]:
# Predict labels for the real (unlabelled) test tweets with the fitted forest.
y_sub1 = model1.predict(test_data_vec)

In [62]:
# Quick look at the predicted test labels (they are floats at this point).
print(y_sub1)

[1. 1. 1. ... 1. 1. 0.]


In [63]:
# Build the submission frame: original test ids plus predicted labels.
# The predictions are floats (1.0 / 0.0) because the label column picked up NaN
# when train and test were stacked; cast back to int so the CSV contains 0/1,
# matching the integer labels in train.csv.
my_submission = pd.DataFrame({'id': test.id, 'label': y_sub1.astype(int)})

In [64]:
# Write the submission to the working directory; index=False keeps the row
# index out of the file so it holds only the id and label columns.
my_submission.to_csv('submission_RF.csv', index = False)