In [53]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
import re
import emoji
from sklearn.feature_extraction.text import TfidfVectorizer

In [1]:
# Fetch the NLTK `twitter_samples` corpus; the following cell reads the
# positive/negative tweet files from it.
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Unzipping corpora/twitter_samples.zip.


True

In [4]:
from nltk.corpus import twitter_samples

# Read both sentiment files of the corpus in one pass; the unpacking order
# (positive first, negative second) matches the order of the file ids.
positive_tweets, negative_tweets = (
    twitter_samples.strings(file_id)
    for file_id in ('positive_tweets.json', 'negative_tweets.json')
)


In [12]:
# Turn each list of tweet strings into a one-column DataFrame (column label 0).
positive, negative = map(pd.DataFrame, (positive_tweets, negative_tweets))

In [15]:
# Give the single text column (default label 0) a descriptive name.
# Rebinding the result instead of using inplace=True keeps the cell's data
# lineage explicit and follows the pandas recommendation to avoid in-place ops.
positive = positive.rename(columns={0: "Tweet Data"})
negative = negative.rename(columns={0: "Tweet Data"})

In [18]:
# Sentiment target: 1 for tweets from the positive file, 0 for the negative one.
positive["Label"], negative["Label"] = 1, 0

In [21]:
# Stack the positive rows on top of the negative rows and renumber the
# index from 0 so every tweet has a unique row label.
all_data = pd.concat(objs=[positive, negative], axis=0, ignore_index=True)
all_data

Unnamed: 0,Tweet Data,Label
0,#FollowFriday @France_Inte @PKuchly57 @Milipol...,1
1,@Lamb2ja Hey James! How odd :/ Please call our...,1
2,@DespiteOfficial we had a listen last night :)...,1
3,@97sides CONGRATS :),1
4,yeaaaah yippppy!!! my accnt verified rqst has...,1
...,...,...
9995,I wanna change my avi but uSanele :(,0
9996,MY PUPPY BROKE HER FOOT :(,0
9997,where's all the jaebum baby pictures :((,0
9998,But but Mr Ahmad Maslan cooks too :( https://t...,0


In [26]:
# Inspect the raw tweet text before any cleaning is applied.
all_data["Tweet Data"]

0       #FollowFriday @France_Inte @PKuchly57 @Milipol...
1       @Lamb2ja Hey James! How odd :/ Please call our...
2       @DespiteOfficial we had a listen last night :)...
3                                    @97sides CONGRATS :)
4       yeaaaah yippppy!!!  my accnt verified rqst has...
                              ...                        
9995                 I wanna change my avi but uSanele :(
9996                           MY PUPPY BROKE HER FOOT :(
9997             where's all the jaebum baby pictures :((
9998    But but Mr Ahmad Maslan cooks too :( https://t...
9999    @eawoman As a Hull supporter I am expecting a ...
Name: Tweet Data, Length: 10000, dtype: object

In [32]:
# Verbose-mode pattern for ASCII emoticons. Three alternatives are accepted:
#   1. left-to-right faces  -> optional brow, eyes, optional nose, mouth
#   2. right-to-left faces  -> mouth, optional nose, eyes, optional brow
#   3. hearts               -> "<3" or "</3"
# Whitespace and "#" comments inside the literal are ignored by re.VERBOSE.
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
      |
      </?3                       # heart
    )"""

# Compiled once so the cleaning loop can reuse it for every tweet.
emoticon_re = re.compile(
    emoticon_string,
    flags=re.UNICODE | re.IGNORECASE | re.VERBOSE,
)

In [44]:
def _strip_emoji(text):
    """Return ``text`` with Unicode emoji removed.

    ``emoji.get_emoji_regexp()`` was deprecated in emoji 1.7 and removed in
    2.0, so ``emoji.replace_emoji`` is preferred; the old regexp is only used
    as a fallback on releases that predate ``replace_emoji``.
    """
    if hasattr(emoji, 'replace_emoji'):
        return emoji.replace_emoji(text, replace='')
    return emoji.get_emoji_regexp().sub('', text)


def clean_tweet(text):
    """Clean one raw tweet string and return the cleaned string.

    Steps, in order:
      1. newlines become spaces;
      2. @mentions and http/https links are replaced by a space, and '#'
         characters are dropped (the hashtag word itself is kept);
      3. the text is lowercased;
      4. Unicode emoji and ASCII emoticons (``emoticon_re``) are removed;
      5. digit runs are removed;
      6. every remaining character that is neither a word character nor
         whitespace is removed.
    """
    text = text.replace('\n', ' ')
    # Raw strings avoid invalid-escape warnings for \S, \d, \w and \s.
    text = re.sub(r'(@|https?)\S+|#', ' ', text)
    text = text.lower()
    text = _strip_emoji(text)
    # Emoticons must go before the punctuation pass, which would otherwise
    # leave stray letters behind (e.g. ":d" -> "d").
    text = emoticon_re.sub('', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    return text


# One cleaned string per row of all_data, in the original row order.
procc_data = [clean_tweet(data) for data in all_data['Tweet Data']]

In [45]:
# Wrap the cleaned strings in a one-column frame; with no column names given
# the column is labelled 0, which is how the next cell accesses it.
proccessed_df = pd.DataFrame(procc_data)

In [46]:
# Overwrite the raw tweets with their cleaned versions (column 0 of
# proccessed_df).
# NOTE(review): this mutates all_data in place, so the raw text is no longer
# available from all_data after this cell runs.
all_data['Tweet Data'] = proccessed_df[0]

In [47]:
# Show the frame again to eyeball the effect of the cleaning step.
all_data

Unnamed: 0,Tweet Data,Label
0,followfriday for being top engaged memb...,1
1,hey james how odd please call our contact c...,1
2,we had a listen last night as you bleed is ...,1
3,congrats,1
4,yeaaaah yippppy my accnt verified rqst has su...,1
...,...,...
9995,i wanna change my avi but usanele,0
9996,my puppy broke her foot,0
9997,wheres all the jaebum baby pictures,0
9998,but but mr ahmad maslan cooks too,0


In [51]:
# Hold out 40% of the tweets for evaluation. A fixed random_state pins the
# shuffle so the split, and every score reported below, is reproducible
# after a kernel restart.
train, test = train_test_split(
    all_data,
    test_size=0.4,
    shuffle=True,
    random_state=42,
)
test

Unnamed: 0,Tweet Data,Label
3941,you aint wrong there buddy,1
7428,i told myself i can survive living alone for t...,0
693,follow me please olly,1
4522,indeed,1
3211,yes cass i think you should,1
...,...,...
5716,gutted for reynold the kind of desserts he has...,0
7490,update via,0
7364,oh thats a shame were really looking for pe...,0
7758,im so sorry about my spam guys,0


In [55]:
# Targets for both splits.
y_train, y_test = train['Label'], test['Label']

# The text was already lowercased during cleaning, so the vectorizer is told
# not to do it again. The vocabulary and IDF weights are learned from the
# training split only; fit() returns the vectorizer itself.
tfidf_vectorizer = TfidfVectorizer(lowercase=False)
tfidf_representation = tfidf_vectorizer.fit(train['Tweet Data'])

# Project both splits onto the vocabulary learned from the training split.
X_train, X_test = (
    tfidf_vectorizer.transform(split['Tweet Data'])
    for split in (train, test)
)

In [56]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features; for a classifier, score()
# is the mean accuracy on the given split.
logreg = LogisticRegression().fit(X_train, y_train)
print('Train Score', logreg.score(X_train, y_train))
print('Test Score', logreg.score(X_test, y_test))

Train Score 0.8826666666666667
Test Score 0.75975


In [67]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares fitted directly on the 0/1 labels. For a regressor,
# score() is the coefficient of determination (R^2), not accuracy, so these
# numbers are not comparable with the classifier scores in other cells.
reg = LinearRegression().fit(X_train, y_train)
print('Train Score', reg.score(X_train, y_train))
print('Test Score', reg.score(X_test, y_test))
print("The best possible score is 1.0 and it is negative (because the model is arbitrarily worse).")

Train Score 0.9443506635912338
Test Score -1.3930334916196712
The best possible score is 1.0 and it is negative (because the model is arbitrarily worse).


In [71]:
from sklearn.linear_model import RidgeClassifier

# Ridge-regularised linear classifier with default settings; score() is the
# mean accuracy on the given split.
clf = RidgeClassifier().fit(X_train, y_train)
print('Train Score', clf.score(X_train, y_train))
print('Test Score', clf.score(X_test, y_test))

Train Score 0.9485
Test Score 0.75625


In [72]:
from sklearn.linear_model import Ridge

# Ridge *regression* on the 0/1 labels. Note that `clf` is rebound here to a
# regressor, so score() is now R^2 rather than accuracy.
clf = Ridge(alpha=1.0).fit(X_train, y_train)
print('Train Score', clf.score(X_train, y_train))
print('Test Score', clf.score(X_test, y_test))

Train Score 0.6973236933670302
Test Score 0.3265988417136828
