In [16]:
import re
import string

import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold
%matplotlib inline

In [17]:
# We use this Kaggle dataset both to develop our pre-processing pipeline and to train our models: https://www.kaggle.com/ramyavidiyala/twitter-tweets-data-for-sentiment-analysis

In [18]:
# Load the tweets; only the `label` and `tweet` columns are read from the CSV.
data = pd.read_csv(
    "Original Datasets/tweets_data.csv",
    usecols=['label', 'tweet'],
)

# Preview the first ten rows.
data.head(10)

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation
5,0,[2/2] huge fan fare and big talking before the...
6,0,@user camping tomorrow @user @user @user @use...
7,0,the next school year is the year for exams.ð...
8,0,we won!!! love the land!!! #allin #cavs #champ...
9,0,@user @user welcome here ! i'm it's so #gr...


In [19]:
#remove special characters
# Collapse every run of characters other than ASCII letters into a single space.
# NOTE(review): this also strips '@', ':', '/', digits and punctuation, so the
# later remove_mentions / remove_number / remove_punctuation helpers have nothing
# left to match, and remove_hyperlink only sees the letter fragments of a URL.
data.tweet = [re.sub('[^A-Za-z]+', ' ', tweet_text) for tweet_text in data.tweet]

In [20]:
# Hold out 30% of the tweets for testing; random_state pins the shuffle so the
# split is the same on every run.
x, y = data.tweet, data.label

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)

# Display the held-out tweets.
X_test

24078     user less than five hours until euro kicks of...
22793     frendship team work together couple banker hi...
29880                          user user thanks once more 
20387    thank you ramadhan ramadhankareem hea love hol...
31381    alton towers the other day altontowers smile s...
                               ...                        
13123                            cat kitty clifford ponce 
19648    manga is ready after see miles today capferrat...
9845     another melbourne snap this guy played the mos...
10799                         user thanks for the retweet 
2732      user user kicks off today check out the full ...
Name: tweet, Length: 22373, dtype: object

In [21]:
# cleaning
# English stop-word list from the NLTK corpus; remove_stopwords() filters against it.
stop = stopwords.words('english')

# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to remove_punctuation().
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by a space, so ``"a,b"`` becomes ``"ab"``.
    """
    return text.translate(_PUNCTUATION_TABLE)
#consider removing punctuation
def remove_stopwords(text):
    """Lower-case ``text`` and drop every word found in the module-level ``stop`` list.

    The text is split on whitespace and the surviving words are re-joined with
    single spaces.
    """
    kept = []
    for token in text.split():
        lowered = token.lower()
        if lowered not in stop:
            kept.append(lowered)
    return " ".join(kept)

def remove_whitespace(word):
    """Return ``word`` without leading or trailing whitespace."""
    return word.strip()

def replace_newline(word):
    """Delete every newline character from ``word``; nothing is inserted in its place."""
    pieces = word.split('\n')
    return ''.join(pieces)

def remove_hyperlink(word):
    """Delete each ``http`` that is followed by one or more non-whitespace characters.

    The whole match (``http`` plus the trailing non-whitespace run) is removed.
    """
    link_pattern = re.compile(r"http\S+")
    return link_pattern.sub("", word)

def remove_mentions(word):
    """Delete each ``@`` together with the run of non-whitespace characters after it."""
    mention_pattern = re.compile(r"@\S+")
    return mention_pattern.sub("", word)

def remove_number(word):
    """Return ``word`` with every run of digits deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', word)

# Order matters: URLs and @mentions are recognised by their punctuation
# ('://', '@'), so they must be removed BEFORE remove_punctuation deletes those
# characters. Running remove_punctuation first (as before) left these two
# helpers with nothing reliable to match.
# Lower-casing up front keeps the lowercase `http` pattern effective;
# remove_stopwords lower-cases every word anyway, so the final text is unaffected.
# replace_newline stays after remove_stopwords (whose split() already treats
# newlines as separators) so words on adjacent lines are not glued together.
# NOTE(review): only X_train is cleaned in the cells visible here; X_test needs
# the same steps before it is vectorised or scored.
X_train = (
    X_train
    .str.lower()
    .apply(remove_hyperlink)
    .apply(remove_mentions)
    .apply(remove_number)
    .apply(remove_punctuation)
    .apply(remove_stopwords)
    .apply(remove_whitespace)
    .apply(replace_newline)
)

print(X_train)

24078    user less five hours euro kicks less hour week...
22793    frendship team work together couple banker hit...
29880                                     user user thanks
20387    thank ramadhan ramadhankareem hea love holy mo...
31381            alton towers day altontowers smile selfie
                               ...                        
13123                             cat kitty clifford ponce
19648    manga ready see miles today capferrat boat hol...
9845     another melbourne snap guy played beautiful so...
10799                                  user thanks retweet
2732     user user kicks today check full list guests s...
Name: tweet, Length: 22373, dtype: object


In [22]:
#tokenization
# word_tokenize is already imported in the notebook's import cell, so it is not
# re-imported here.
# X_train is overwritten: each cleaned tweet string becomes a list of tokens, so
# this cell expects the string-valued X_train produced by the cleaning cell.
X_train = X_train.apply(word_tokenize)

print(X_train)

24078    [user, less, five, hours, euro, kicks, less, h...
22793    [frendship, team, work, together, couple, bank...
29880                                 [user, user, thanks]
20387    [thank, ramadhan, ramadhankareem, hea, love, h...
31381     [alton, towers, day, altontowers, smile, selfie]
                               ...                        
13123                        [cat, kitty, clifford, ponce]
19648    [manga, ready, see, miles, today, capferrat, b...
9845     [another, melbourne, snap, guy, played, beauti...
10799                              [user, thanks, retweet]
2732     [user, user, kicks, today, check, full, list, ...
Name: tweet, Length: 22373, dtype: object


In [23]:
#stemming
# PorterStemmer is already imported in the notebook's import cell.
stemmer = PorterStemmer()

def stem_words(text):
    """Stem every token of ``text`` and join the stems with single spaces.

    ``text`` is normally the token list produced by the tokenization cell.
    A plain string is split on whitespace first: iterating a string directly
    would stem it character by character (for example when this cell is re-run
    after X_train has already been joined back into strings).
    """
    tokens = text.split() if isinstance(text, str) else text
    return " ".join(stemmer.stem(word) for word in tokens)

X_train = X_train.apply(stem_words)

print(X_train)

24078    user less five hour euro kick less hour weeken...
22793    frendship team work togeth coupl banker hit ba...
29880                                      user user thank
20387    thank ramadhan ramadhankareem hea love holi mo...
31381                 alton tower day altontow smile selfi
                               ...                        
13123                              cat kitti clifford ponc
19648    manga readi see mile today capferrat boat holi...
9845     anoth melbourn snap guy play beauti sound inst...
10799                                   user thank retweet
2732     user user kick today check full list guest see...
Name: tweet, Length: 22373, dtype: object


In [24]:
#bag of words using count vectorization

# min_df=2 ignores terms that appear in fewer than two tweets; max_features=1000
# caps the vocabulary size.
bag = CountVectorizer(min_df=2, max_features=1000)

# fit_transform learns the vocabulary and encodes X_train in one call (equivalent
# to fit() followed by transform()); toarray() converts the sparse result to a
# dense NumPy array.
bag_df = bag.fit_transform(X_train).toarray()

# A bare `bag_df.shape` in the middle of a cell is evaluated but never shown,
# so print it explicitly.
print(bag_df.shape)

#bag.vocabulary_
print(bag_df)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [31]:
# Unregularised logistic regression on the bag-of-words counts.
# `penalty=None` replaces the string 'none', which scikit-learn deprecated in 1.2
# and rejects in later releases (this spelling requires scikit-learn >= 1.2).
lr = LogisticRegression(max_iter=1000, penalty=None).fit(bag_df, y_train)

In [32]:
# Scored on the same bag_df / y_train the model was fitted on, so this is a
# training-set score, not an estimate of performance on unseen tweets.
lr.score(bag_df,y_train)

0.957850981093282

In [33]:
# Predictions for the training matrix itself (bag_df was built from X_train).
lr.predict(bag_df)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [34]:
# Fitted coefficients of the logistic regression trained on bag_df.
lr.coef_

array([[-9.33648714e+00,  5.37725545e-01,  5.29086951e-01,
        -1.88619684e-02,  2.14989100e+00,  1.31087169e+00,
        -1.67491193e+00, -1.53415637e+00, -3.85368521e-01,
        -2.72381305e+00,  5.72048232e-01, -8.86814922e+00,
        -1.27013975e+01, -9.46782572e+00,  1.20541086e+00,
         2.20103266e+00,  9.31780950e-01,  2.46357653e+00,
        -5.23957275e-01, -1.00891726e+01, -1.27140946e+00,
         3.21856046e+01,  7.82197694e-01,  5.77257079e-01,
        -1.93937130e+00, -6.62183061e-01,  4.73755931e-01,
         9.24379044e-01, -3.98131512e+00, -1.92097513e+00,
        -1.26755005e+01,  1.00677452e+00,  6.60932912e-01,
         6.85066758e-01, -9.50446159e+00, -1.03456968e+00,
        -8.86861156e-01, -1.03146021e+01, -1.36072606e+00,
        -1.45687575e+00,  1.21615718e+00,  1.86148461e+00,
        -9.21428624e+00, -7.80093328e-01,  5.77470362e-01,
        -8.93305885e-01, -1.58902108e+00, -1.13289198e+01,
         6.03267030e-02, -1.33472119e+00, -6.27435623e-0

In [None]:
#fractional change