<a href="https://colab.research.google.com/github/DenisOgr/sentiment-batch-stream-pipeline/blob/main/experiments/experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the training CSV is later read from
# a path under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Experiment 1

[Source: Another Twitter sentiment analysis with Python](https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-11-cnn-word2vec-41f5e28eda74)

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk.tokenize import WordPunctTokenizer
# Shared tokenizer instance; tweet_cleaner_updated calls tok.tokenize() on the
# letters-only text to split it into tokens.
tok = WordPunctTokenizer()

In [None]:

# Regex building blocks used by tweet_cleaner_updated to strip noise from raw tweets.
pat1 = r'@[A-Za-z0-9_]+'   # @mentions
pat2 = r'https?://[^ ]+'   # http:// and https:// URLs, up to the next space
combined_pat = r'|'.join((pat1, pat2))
# The dot is escaped so that only a literal "www." prefix matches. Unescaped, "."
# also matched a space or a letter, so text such as "awwww that" lost "www that".
www_pat = r'www\.[^ ]+'
# Contractions (matched on lowercased text) and the two-word form that replaces them.
negations_dic = {"isn't":"is not", "aren't":"are not", "wasn't":"was not", "weren't":"were not",
                "haven't":"have not","hasn't":"has not","hadn't":"had not","won't":"will not",
                "wouldn't":"would not", "don't":"do not", "doesn't":"does not","didn't":"did not",
                "can't":"can not","couldn't":"could not","shouldn't":"should not","mightn't":"might not",
                "mustn't":"must not"}
# One alternation over every contraction, anchored on word boundaries.
neg_pattern = re.compile(r'\b(' + '|'.join(negations_dic.keys()) + r')\b')


In [None]:
# Load the raw training CSV from the mounted Drive. header=None means the file's
# first row is data, so the column names are supplied through `names`.
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']
df_train = pd.read_csv(
    '/content/drive/MyDrive/sentiment-batch-stream-pipeline/trainingandtestdata/training.1600000.processed.noemoticon.csv',
    names=cols,
    header=None,
    engine='python',
)

In [None]:
# Keep only the label and the tweet text. The explicit .copy() makes the result an
# independent frame, so later column assignments on df_train do not raise pandas'
# SettingWithCopyWarning.
df_train = df_train[['sentiment', 'text']].copy()

### Preprocessing

In [None]:
def tweet_cleaner_updated(text):
    """Normalise one raw tweet into a lowercase, letters-only, space-separated string.

    Steps, in order: strip HTML with BeautifulSoup, replace U+FFFD with "?",
    remove @mentions and URLs (``combined_pat``, ``www_pat``), lowercase,
    expand the contractions listed in ``negations_dic``, turn every non-letter
    into a space, then drop single-character tokens.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned tweet; an empty string when no token survives.
    """
    souped = BeautifulSoup(text, 'lxml').get_text()
    # The previous code called souped.decode(...) inside a bare ``except:``. A
    # Python 3 str has no .decode(), so that path always failed and the U+FFFD
    # replacement never ran, while the bare except could also hide real errors.
    # Decode only if bytes are actually returned (not expected from get_text()).
    if isinstance(souped, bytes):
        souped = souped.decode("utf-8-sig")
    bom_removed = souped.replace(u"\ufffd", "?")
    stripped = re.sub(combined_pat, '', bom_removed)
    stripped = re.sub(www_pat, '', stripped)
    lower_case = stripped.lower()
    neg_handled = neg_pattern.sub(lambda m: negations_dic[m.group()], lower_case)
    letters_only = re.sub("[^a-zA-Z]", " ", neg_handled)
    # Replacing non-letters with spaces leaves runs of whitespace; tokenizing and
    # re-joining collapses them, and the length filter drops one-letter leftovers.
    words = [w for w in tok.tokenize(letters_only) if len(w) > 1]
    return " ".join(words).strip()

In [None]:
# Recode the labels to binary: 4 becomes 1, 0 stays 0.
# .replace() leaves an already-recoded column untouched, so re-running this cell
# is safe; the previous .map({0: 0, 4: 1}) turned every 1 into NaN on a second run.
df_train['sentiment'] = df_train['sentiment'].replace({4: 1})
# Fail fast on any label that is neither 0 nor 1 (.map used to turn those into NaN).
assert df_train['sentiment'].isin([0, 1]).all(), "unexpected sentiment label"

In [None]:
%%time
print("Cleaning the tweets...\n")
clean_tweet_texts = []
for i in range(0,len(df_train)):
    if( (i+1)%100000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, len(df_train) ))
    clean_tweet_texts.append(tweet_cleaner_updated(df_train['text'][i]))

Cleaning the tweets...

Tweets 100000 of 1600000 has been processed
Tweets 200000 of 1600000 has been processed
Tweets 300000 of 1600000 has been processed
Tweets 400000 of 1600000 has been processed
Tweets 500000 of 1600000 has been processed
Tweets 600000 of 1600000 has been processed
Tweets 700000 of 1600000 has been processed
Tweets 800000 of 1600000 has been processed
Tweets 900000 of 1600000 has been processed
Tweets 1000000 of 1600000 has been processed
Tweets 1100000 of 1600000 has been processed
Tweets 1200000 of 1600000 has been processed
Tweets 1300000 of 1600000 has been processed
Tweets 1400000 of 1600000 has been processed
Tweets 1500000 of 1600000 has been processed
Tweets 1600000 of 1600000 has been processed
CPU times: user 6min 17s, sys: 23.6 s, total: 6min 40s
Wall time: 6min 41s


In [None]:
# Pair each cleaned tweet with its label (aligned on the row index) and persist
# the result as CSV in the working directory.
clean_df = pd.DataFrame(clean_tweet_texts, columns=['text']).assign(
    target=df_train.sentiment
)
clean_df.to_csv('clean_tweet.csv', encoding='utf-8')

In [None]:
# Preview the first five cleaned rows.
clean_df.head(n=5)

Unnamed: 0,text,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


### Feature engineering

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [None]:
# Bag-of-words features over 1- to 3-grams, with the vocabulary capped at 80,000 entries.
# (A stray bare `lr` expression was removed from this cell: `lr` is only defined
# further down, so it raised NameError on a fresh kernel.)
vectorizer = CountVectorizer(max_features=80000, ngram_range=(1,3))

#### Model building

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Logistic-regression classifier with the iteration cap set to 5000.
lr = LogisticRegression(max_iter=5000)

In [None]:
# 80/20 train/test split. random_state is fixed so the split, and therefore the
# reported metrics, are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    clean_df.text, clean_df.target, test_size=.2, random_state=42
)

In [None]:
# Sanity check: shapes of the four split parts, in split order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1280000,), (320000,), (1280000,), (320000,))

In [None]:
# Chain vectorisation and classification so both are fitted and applied together.
pipeline = Pipeline(steps=[
    ('vect', vectorizer),
    ('lr', lr),
])


In [None]:
# Fit the vectorizer and the classifier on the training split only.
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=80000, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('lr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=5000,
                                    multi_class='auto', n_jobs=None,
                         

#### Model evaluation

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [None]:
# Predict labels for the held-out test split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

In [None]:
def score(y_true, y_pred):
  """Print accuracy, the confusion matrix and the classification report.

  Args:
      y_true: Ground-truth labels.
      y_pred: Predicted labels, in the same order as ``y_true``.
  """
  labelled_metrics = (
      ('Accuracy: ', accuracy_score),
      ('Confusion matrix: ', confusion_matrix),
      ('Classification report: ', classification_report),
  )
  # Each metric is computed right before it is printed, in the order listed.
  for label, metric in labelled_metrics:
    print(label, metric(y_true, y_pred))


In [None]:
# Evaluate the Experiment 1 predictions against the held-out labels.
score(y_test.to_numpy(), y_pred)

Accuracy:  0.817871875
Confusion matrix:  [[128039  31861]
 [ 26420 133680]]
Classification report:                precision    recall  f1-score   support

           0       0.83      0.80      0.81    159900
           1       0.81      0.83      0.82    160100

    accuracy                           0.82    320000
   macro avg       0.82      0.82      0.82    320000
weighted avg       0.82      0.82      0.82    320000



## Experiment 2

In [None]:
# Fetch the NLTK stop-word corpus; it is read later through
# nltk.corpus.stopwords.words('english').
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

[Source: Depression On Social Media](https://www.kaggle.com/redaabdou/depression-on-social-media)

In [None]:
# Reload the raw training CSV for Experiment 2 and recode the labels (0 -> 0, 4 -> 1)
# in the same cell, so re-running it always starts from the file contents.
cols = ['sentiment', 'id', 'date', 'query_string', 'user', 'text']
df_train = pd.read_csv(
    '/content/drive/MyDrive/sentiment-batch-stream-pipeline/trainingandtestdata/training.1600000.processed.noemoticon.csv',
    names=cols,
    header=None,
    engine='python',
).assign(sentiment=lambda frame: frame['sentiment'].map({0: 0, 4: 1}))

In [None]:
# Keep only the label and the tweet text. The explicit .copy() makes the result an
# independent frame, so adding the text_clean column afterwards does not raise
# pandas' SettingWithCopyWarning.
df_train = df_train[['sentiment', 'text']].copy()

#### Data Cleaning

In [None]:
# Start text_clean from the raw text with every literal "@" removed
# (the user names themselves are kept).
df_train['text_clean'] = df_train.text.str.replace("@", "", regex=False)
df_train.head()

Unnamed: 0,sentiment,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","switchfoot http://twitpic.com/2y1zl - Awww, th..."
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...,Kenichan I dived many times for the ball. Mana...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....","nationwideclass no, it's not behaving at all. ..."


In [None]:
# Remove URLs: "http" followed by any run of non-whitespace characters.
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to regex=False, which would treat the pattern as literal
# text and silently leave every URL in place.
df_train['text_clean'] = df_train['text_clean'].str.replace(r"http\S+", "", regex=True)
df_train.head()

Unnamed: 0,sentiment,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","switchfoot - Awww, that's a bummer. You shou..."
1,0,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...,Kenichan I dived many times for the ball. Mana...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....","nationwideclass no, it's not behaving at all. ..."


In [None]:
# Replace every character that is not an ASCII letter with a space.
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to regex=False, which would search for the literal text
# "[^a-zA-Z]" and leave digits and punctuation untouched.
df_train['text_clean'] = df_train['text_clean'].str.replace(r"[^a-zA-Z]", " ", regex=True)
df_train.head()

Unnamed: 0,sentiment,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot Awww that s a bummer You shou...
1,0,is upset that he can't update his Facebook by ...,is upset that he can t update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...,Kenichan I dived many times for the ball Mana...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all....",nationwideclass no it s not behaving at all ...


In [None]:
stopwords = nltk.corpus.stopwords.words('english')
# Set copy for constant-time membership tests; the list above is kept unchanged
# for any code that expects it.
_stopword_set = frozenset(stopwords)


def remove_stopwords(text):
    """Return ``text`` with every whitespace-separated token found in ``stopwords`` removed.

    Matching is exact and case-sensitive; the cell that applies this function
    lowercases each tweet before calling it.

    Args:
        text: Input string.

    Returns:
        The remaining tokens joined by single spaces ("" if none remain).
    """
    return ' '.join(word for word in text.split() if word not in _stopword_set)

In [None]:
# Lowercase each tweet, then drop its stop words.
df_train['text_clean'] = df_train['text_clean'].map(
    lambda tweet: remove_stopwords(tweet.lower())
)
df_train.head()

Unnamed: 0,sentiment,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","switchfoot http://twitpic.com/2y1zl - awww, th..."
1,0,is upset that he can't update his Facebook by ...,upset can't update facebook texting it... migh...
2,0,@Kenichan I dived many times for the ball. Man...,kenichan dived many times ball. managed save 5...
3,0,my whole body feels itchy and like its on fire,whole body feels itchy like fire
4,0,"@nationwideclass no, it's not behaving at all....","nationwideclass no, behaving all. i'm mad. her..."


In [None]:
# Split each tweet on whitespace into a list of tokens (input for the stemmer).
df_train['text_clean'] = df_train['text_clean'].map(str.split)
df_train.head()

Unnamed: 0,sentiment,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, http://twitpic.com/2y1zl, -, awww..."
1,0,is upset that he can't update his Facebook by ...,"[upset, can't, update, facebook, texting, it....."
2,0,@Kenichan I dived many times for the ball. Man...,"[kenichan, dived, many, times, ball., managed,..."
3,0,my whole body feels itchy and like its on fire,"[whole, body, feels, itchy, like, fire]"
4,0,"@nationwideclass no, it's not behaving at all....","[nationwideclass, no,, behaving, all., i'm, ma..."


In [None]:
# Import only the name that is used; the previous star import pulled every public
# name of nltk.stem.porter into the notebook namespace.
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()
# Stem every token of every tweet; text_clean holds token lists at this point.
df_train['text_clean'] = df_train['text_clean'].apply(lambda x: [stemmer.stem(i) for i in x])
df_train.head()

Unnamed: 0,sentiment,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[switchfoot, http://twitpic.com/2y1zl, -, awww..."
1,0,is upset that he can't update his Facebook by ...,"[upset, can't, updat, facebook, text, it..., m..."
2,0,@Kenichan I dived many times for the ball. Man...,"[kenichan, dive, mani, time, ball., manag, sav..."
3,0,my whole body feels itchy and like its on fire,"[whole, bodi, feel, itchi, like, fire]"
4,0,"@nationwideclass no, it's not behaving at all....","[nationwideclass, no,, behav, all., i'm, mad.,..."


In [None]:
# Turn each token list back into a single space-separated string.
df_train['text_clean'] = df_train['text_clean'].map(' '.join)
df_train.head()

Unnamed: 0,sentiment,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","switchfoot http://twitpic.com/2y1zl - awww, th..."
1,0,is upset that he can't update his Facebook by ...,upset can't updat facebook text it... might cr...
2,0,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball. manag save 50% r...
3,0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....","nationwideclass no, behav all. i'm mad. here? ..."


In [None]:
# Keep only tokens longer than three characters.
df_train['text_clean'] = df_train['text_clean'].map(
    lambda tweet: ' '.join(token for token in tweet.split() if len(token) > 3)
)
df_train.head()

Unnamed: 0,sentiment,text,text_clean
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","switchfoot http://twitpic.com/2y1zl awww, that..."
1,0,is upset that he can't update his Facebook by ...,upset can't updat facebook text it... might re...
2,0,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball. manag save rest ...
3,0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",nationwideclass behav all. mad. here? can't th...


In [None]:
# Persist the Experiment 2 frame (label, raw text, cleaned text) as CSV.
df_train.to_csv(path_or_buf='clean_tweet_2.csv', encoding='utf-8')

#### Model building

##### Approach 1

In [None]:
# 80/20 train/test split on the cleaned text. random_state is fixed so the split,
# and therefore the reported metrics, are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    df_train.text_clean, df_train.sentiment, test_size=.2, random_state=42
)

In [None]:
# Approach 1: vectorizer (with scikit-learn's English stop-word list) and
# classifier wrapped in one pipeline.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english')),
    ('lr', LogisticRegression(max_iter=500)),
])

In [None]:
# Fit the Approach 1 pipeline on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the Approach 1 test split.
# NOTE(review): the saved output of this cell is an AttributeError whose cause is
# not visible here; re-run on a fresh kernel to confirm it is resolved.
# NOTE(review): the name y_pred_2 is reassigned later by the Approach 2 prediction
# cell, so these predictions are lost once that cell runs.
y_pred_2 = pipeline.predict(X_test)

AttributeError: ignored

##### Approach 2

In [None]:
# Approach 2: vectorize first, then split. The vocabulary is capped at 100,000 entries.
# NOTE(review): fit_transform runs on the full frame before the train/test split,
# so the vocabulary is built from test rows as well.
count_vectorizer = CountVectorizer(max_features=100000, stop_words='english')
cv = count_vectorizer.fit_transform(df_train.text_clean)
cv.shape

(1600000, 100000)

In [None]:
# 80/20 split of the vectorized matrix. random_state is fixed so the split, and
# therefore the reported metrics, are reproducible across kernel restarts.
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(
    cv, df_train.sentiment, test_size=.2, random_state=42
)

In [None]:
# Build the Approach 2 classifier, then fit it on the vectorized training split.
lr2 = LogisticRegression(max_iter=5000)
lr2 = lr2.fit(X_train_2, y_train_2)  # keep whatever fit() returns, as before

In [None]:
# Predict on the Approach 2 test split (this reassigns y_pred_2).
y_pred_2 = lr2.predict(X_test_2)

In [None]:
# Evaluate the Approach 2 predictions against their own held-out labels.
score(y_test_2.to_numpy(), y_pred_2)

Accuracy:  0.759009375
Confusion matrix:  [[118132  41934]
 [ 35183 124751]]
Classification report:                precision    recall  f1-score   support

           0       0.77      0.74      0.75    160066
           1       0.75      0.78      0.76    159934

    accuracy                           0.76    320000
   macro avg       0.76      0.76      0.76    320000
weighted avg       0.76      0.76      0.76    320000



#### Model evaluation

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
def score(y_true, y_pred):
  """Print accuracy, the confusion matrix and the classification report.

  This repeats the ``score`` helper defined earlier in the notebook with the
  same behaviour.

  Args:
      y_true: Ground-truth labels.
      y_pred: Predicted labels, in the same order as ``y_true``.
  """
  accuracy = accuracy_score(y_true, y_pred)
  print('Accuracy: ', accuracy)
  matrix = confusion_matrix(y_true, y_pred)
  print('Confusion matrix: ', matrix)
  report = classification_report(y_true, y_pred)
  print('Classification report: ', report)


In [None]:
# Evaluate Approach 1 of Experiment 2. Predictions are computed here from the
# current pipeline and X_test so they belong to the same split as y_test.
# The previous call scored `y_pred`, which still held the Experiment 1
# predictions for a different test split; that explains the chance-level
# accuracy (0.5005) recorded for this cell.
score(y_test.values, pipeline.predict(X_test))

Accuracy:  0.500540625
Confusion matrix:  [[77202 82570]
 [77257 82971]]
Classification report:                precision    recall  f1-score   support

           0       0.50      0.48      0.49    159772
           1       0.50      0.52      0.51    160228

    accuracy                           0.50    320000
   macro avg       0.50      0.50      0.50    320000
weighted avg       0.50      0.50      0.50    320000



In [None]:
# Inspect the training texts. The saved output still shows URLs and punctuation,
# so the URL / non-letter cleaning cells had not been applied to this state.
X_train

1034436    work song buddi lilplayi imma make proud shorr...
1506228    jomargarcia followersflood linisnia rosamondbr...
728988            piyushchitkara doesnt everyon choic though
30185                 syalam need iphon fulli enjoy trapster
112051                                           arbor remov
                                 ...                        
1300593                              tahnini shall bear mind
546691                                    knee accident hurt
316381                                          colleg wanna
142420     anoopdoggdesai home watch definit it!!! can't ...
1312567    websit final finish remod http://www.clickedap...
Name: text_clean, Length: 1280000, dtype: object