# Twitter Sentiment Analysis

In [36]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

import preprocess_nlp_Cesar_Arcos as pp

import pickle

In [3]:
df = pd.read_csv("twitt30k.csv")

In [6]:
df = df.rename(columns={'twitts': 'tweets'})

In [7]:
df

Unnamed: 0,tweets,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
...,...,...
29995,@Calumfan1 is it in any way related to photosh...,0
29996,@Swiz_NZ really? wow thats crap,0
29997,"At the 2010 lexus HS250h press event. Again, ...",0
29998,@karmicunderpath ooooh now there's a nice thou...,1


In [9]:
df['sentiment'].value_counts()

1    15000
0    15000
Name: sentiment, dtype: int64

## SVM model and Data Preparation

In [12]:
def run_svm(df):
    X = df['tweets']
    y = df['sentiment']
    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=0,stratify = y)

    print('Shape of X: ',X.shape)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    
    return tfidf, clf    

In [14]:
%%time 
tfidf, clf = run_svm(df)

Shape of X:  (30000, 40854)
Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

CPU times: user 798 ms, sys: 0 ns, total: 798 ms
Wall time: 796 ms


In [16]:
x = ["i am really happy thanks a lot for coming with me"]

In [17]:
clf.predict(tfidf.transform(x))

array([1])

## Data Cleaning and Retraining SVM

In [20]:
pp.__version__

'0.0.1'

In [21]:
df['tweets'] = df['tweets'].apply(lambda x: x.lower())

In [22]:
df['tweets'] = df['tweets'].apply(lambda x: pp.cont_exp(x))

In [23]:
df

Unnamed: 0,tweets,sentiment
0,@robbiebronniman sounds like a great night.,1
1,damn the person who stolde my wallet !!!!! ma...,1
2,greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars pretty pretty pretty please, pak...",0
...,...,...
29995,@calumfan1 is it in any way related to photosh...,0
29996,@swiz_nz really? wow thats crap,0
29997,"at the 2010 lexus hs250h press event. again, ...",0
29998,@karmicunderpath ooooh now there is a nice tho...,1


In [24]:
run_svm(df)

Shape of X:  (30000, 40846)
Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.75      0.76      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



(TfidfVectorizer(), LinearSVC())

In [25]:
df['tweets'] = df['tweets'].apply(lambda x: pp.remove_emails(x))
df['tweets'] = df['tweets'].apply(lambda x: pp.remove_urls(x))
df['tweets'] = df['tweets'].apply(lambda x: pp.remove_rt(x))
df['tweets'] = df['tweets'].apply(lambda x: pp.remove_html_tags(x))
df['tweets'] = df['tweets'].apply(lambda x: pp.remove_special_chars(x))


In [26]:
tfidf, clf = run_svm(df)

Shape of X:  (30000, 42931)
Printing Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



In [27]:
x

['i am really happy thanks a lot for coming with me']

In [28]:
clf.predict(tfidf.transform(x))

array([1])

## Fine Tuning Model 

In [33]:
def run_svm(df):
    X = df['tweets']
    y = df['sentiment']

    tfidf = TfidfVectorizer(norm = 'l1', ngram_range=(1,2), analyzer='word', max_features=5000)
    X = tfidf.fit_transform(X)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0, stratify = y)

    print('shape of X: ', X.shape)

    clf = LinearSVC()
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)

    print()
    print('Printing Report')
    print(classification_report(y_test, y_pred))
    
    return tfidf, clf

run_svm(df)

shape of X:  (30000, 5000)

Printing Report
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      3000
           1       0.77      0.75      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



(TfidfVectorizer(max_features=5000, ngram_range=(1, 2), norm='l1'),
 LinearSVC())

## Saving and Loading ML Model

In [37]:
pickle.dump(clf, open('clf.pkl', 'wb'))
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))

In [38]:
del clf
del tfidf

In [39]:
clf = pickle.load(open('clf.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [40]:
clf

LinearSVC()

In [41]:
x

['i am really happy thanks a lot for coming with me']

In [42]:
clf.predict(tfidf.transform(x))

array([1])

# Real-Time Twitter Sentiment Analysis

In [122]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

import preprocess_nlp_Cesar_Arcos as pp

import pickle
import json
import csv

from textblob import TextBlob

import tweepy
from Twitter_key import consumer_key,consumer_secret,access_token,access_token_secret

In [50]:
#!pip install tweepy
#!pip install git+ssh://git@github.com/racec9999/preprocess_nlp_Cesar_Arcos.git

In [95]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

api = tweepy.API(auth)

public_tweets = api.home_timeline()

In [96]:
public_tweets

[Status(_api=<tweepy.api.API object at 0x7fb49984e250>, _json={'created_at': 'Mon Mar 29 20:16:16 +0000 2021', 'id': 1376629300619804683, 'id_str': '1376629300619804683', 'text': 'La verdad, no nos sorprende que sean estos dos.\n¿Sus teclados también lucen así? https://t.co/1NytjK95uH', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [], 'urls': [], 'media': [{'id': 1376627689285640192, 'id_str': '1376627689285640192', 'indices': [81, 104], 'media_url': 'http://pbs.twimg.com/media/ExrDlvfWYAAHip_.png', 'media_url_https': 'https://pbs.twimg.com/media/ExrDlvfWYAAHip_.png', 'url': 'https://t.co/1NytjK95uH', 'display_url': 'pic.twitter.com/1NytjK95uH', 'expanded_url': 'https://twitter.com/XboxMexico/status/1376629300619804683/photo/1', 'type': 'photo', 'sizes': {'large': {'w': 800, 'h': 800, 'resize': 'fit'}, 'medium': {'w': 800, 'h': 800, 'resize': 'fit'}, 'thumb': {'w': 150, 'h': 150, 'resize': 'crop'}, 'small': {'w': 680, 'h': 680, 'resize': 'fit'}}}]}, 

In [97]:
type(public_tweets)

tweepy.models.ResultSet

In [98]:
public_tweets[2].text

"RT @CoinDeskMarkets: No wonder cryptocurrency markets were so lackluster last week: There just wasn't much appetite from investors to put n…"

## Tracking Keywords on Twitter

In [103]:
track_keyword = ['usa','china']

In [107]:
class MyStreamListener(tweepy.StreamListener):
    
    def on_status(self,status):
        print(status.text)
    
    def on_data(self, data):
        raw_tweets = json.loads(data)
        try:
            x = str(raw_tweets['text']).lower()
            x = pp.cont_exp(x)
            x = pp.remove_emails(x)
            x = pp.remove_html_tags(x)
            x = pp.remove_rt(x)
            x = pp.remove_special_chars(x)
            x = pp.remove_urls(x)
            
            blob = TextBlob(x)
            sentiment = blob.sentiment.polarity
            print(sentiment)
        except:
            pass
        
        
    def on_error(self,status_code):
        if status_code == 420:
            print('Error 420')
            return False # returnin False in on_error disconnects the stream

In [108]:
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener =myStreamListener )

In [109]:
myStream.filter(track=track_keyword)

0.2771428571428572
0.0
0.0
0.0
0.2333333333333333
0.0
0.0
0.2
0.4
0.0
0.2
-0.625
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.4
-0.3333333333333333
0.0
0.0
0.0
0.2333333333333333
0.13333333333333333
0.0
-0.6
0.0
0.08333333333333333
0.0
0.6000000000000001
0.0
0.0
0.2
0.0
0.25
0.0
-0.25
0.0
0.07777777777777777
0.0
0.0
0.0
0.0
0.0
0.0
0.0
-0.05
0.0
0.0
0.0
0.5
0.0
-0.25
0.0
0.14285714285714285
0.0
-0.125
-0.5
0.7
0.0
0.8
0.25
0.6875
0.6625
0.0
0.14285714285714285
0.0
0.0
0.0
-0.5
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.03333333333333333
0.1619047619047619
0.0
-0.2
0.0
0.0
-0.15000000000000002
-0.15625
0.05
0.0
-0.13333333333333333
0.08333333333333333
0.0
0.0
0.225
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


KeyboardInterrupt: 

In [111]:
clf = pickle.load(open('clf.pkl', 'rb'))
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [112]:
def predict_sentiment(x):
    x = [x]
    sent = clf.predict(tfidf.transform(x))
    return sent

In [126]:
track_keyword = ['usa','china']

In [127]:
usa = 0
china = 0

with open('sentiment.csv', 'w') as file:
    writer = csv.DictWriter(file, fieldnames=track_keyword)
    writer.writeheader()
    
    
class MyStreamListener(tweepy.StreamListener):
    
    def on_status(self,status):
        print(status.text)
    
    def on_data(self, data):
        raw_tweets = json.loads(data)
        try:
            x = str(raw_tweets['text']).lower()
            x = pp.cont_exp(x)
            x = pp.remove_emails(x)
            x = pp.remove_html_tags(x)
            x = pp.remove_rt(x)
            x = pp.remove_special_chars(x)
            x = pp.remove_urls(x)
            
            #blob = TextBlob(x)
            #sentiment = predict_sentiment(x)[0]
            #print(sentiment)
    
            
            
            global china
            global usa
            
            if 'usa' in x and 'china' not in x:
                sent = predict_sentiment(x)[0]
                usa = usa + sent
                
            elif 'china' in x and 'usa' not in x:
                sent = predict_sentiment(x)[0]
                china = china + sent
                
            else:
                pass
            
            
        except:
            pass
        print('Usa: ',usa, 'China: ',china)
        
        with open('sentiment.csv', 'a') as file:
                writer = csv.DictWriter(file, fieldnames=track_keyword)
                info = {
                    'usa': usa,
                    'china': china
                }
                writer.writerow(info)
        
    def on_error(self,status_code):
        if status_code == 420:
            print('Error 420')
            return False # returnin False in on_error disconnects the stream

In [128]:
myStreamListener = MyStreamListener()
myStream = tweepy.Stream(auth = api.auth, listener =myStreamListener )

In [129]:
myStream.filter(track=track_keyword)

Usa:  1 China:  0
Usa:  1 China:  0
Usa:  2 China:  0
Usa:  2 China:  1
Usa:  2 China:  1
Usa:  3 China:  1
Usa:  3 China:  1
Usa:  3 China:  2
Usa:  3 China:  2
Usa:  3 China:  3
Usa:  3 China:  4
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  3 China:  5
Usa:  4 China:  5
Usa:  4 China:  5
Usa:  4 China:  5
Usa:  4 China:  6
Usa:  4 China:  6
Usa:  4 China:  6
Usa:  5 China:  6
Usa:  5 China:  6
Usa:  5 China:  6
Usa:  5 China:  7
Usa:  5 China:  7
Usa:  5 China:  8
Usa:  5 China:  8
Usa:  5 China:  8
Usa:  5 China:  8
Usa:  5 China:  8
Usa:  5 China:  8
Usa:  5 China:  8
Usa:  5 China:  9
Usa:  5 China:  9
Usa:  5 China:  9
Usa:  5 China:  9
Usa:  5 China:  9
Usa:  5 China:  9
Usa:  5 China:  9
Usa:  6 China:  9
Usa:  6 China:  9
Usa:  6 China:  9
Usa:  6 China:  9
Usa:  6 China:  9
Usa:  6 China:  9
Usa:  6 China:  9
Usa:  6 China:  9
Usa:  6 China:  10
Usa:  6 C



Usa:  8 China:  13
Usa:  8 China:  13
Usa:  9 China:  13
Usa:  10 China:  13
Usa:  11 China:  13
Usa:  11 China:  13
Usa:  11 China:  14
Usa:  11 China:  14
Usa:  11 China:  14
Usa:  12 China:  14
Usa:  12 China:  14
Usa:  12 China:  15
Usa:  12 China:  15
Usa:  13 China:  15
Usa:  13 China:  15
Usa:  13 China:  16
Usa:  13 China:  16
Usa:  14 China:  16
Usa:  14 China:  16
Usa:  14 China:  16
Usa:  14 China:  16
Usa:  14 China:  16
Usa:  14 China:  16
Usa:  14 China:  17
Usa:  14 China:  17
Usa:  14 China:  17
Usa:  14 China:  17
Usa:  14 China:  18
Usa:  14 China:  18
Usa:  14 China:  18
Usa:  14 China:  18
Usa:  14 China:  18
Usa:  14 China:  18
Usa:  14 China:  18
Usa:  14 China:  18
Usa:  15 China:  18
Usa:  15 China:  18
Usa:  16 China:  18
Usa:  16 China:  18
Usa:  16 China:  18
Usa:  16 China:  18
Usa:  16 China:  19
Usa:  16 China:  19
Usa:  16 China:  19
Usa:  16 China:  19
Usa:  17 China:  19
Usa:  18 China:  19
Usa:  18 China:  19
Usa:  18 China:  20
Usa:  18 China:  20
Usa



Usa:  19 China:  22
Usa:  19 China:  22
Usa:  19 China:  22
Usa:  19 China:  23
Usa:  19 China:  24
Usa:  19 China:  24
Usa:  19 China:  24
Usa:  19 China:  24
Usa:  20 China:  24
Usa:  21 China:  24
Usa:  21 China:  25
Usa:  21 China:  25
Usa:  21 China:  25
Usa:  22 China:  25
Usa:  22 China:  25
Usa:  22 China:  25
Usa:  22 China:  25
Usa:  22 China:  25
Usa:  22 China:  25
Usa:  23 China:  25
Usa:  23 China:  25
Usa:  24 China:  25
Usa:  24 China:  25
Usa:  25 China:  25
Usa:  26 China:  25
Usa:  27 China:  25
Usa:  27 China:  25
Usa:  27 China:  25
Usa:  27 China:  26
Usa:  28 China:  26
Usa:  28 China:  26
Usa:  28 China:  26
Usa:  28 China:  26
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  27
Usa:  28 China:  28
Usa:  28 China:  29
Usa:  28 China:  29
Usa:  29 China:  29
Usa:  29 China:  30
Usa:  29 China:  30
Usa:  29 China:  30




Usa:  45 China:  39
Usa:  46 China:  39
Usa:  46 China:  39
Usa:  46 China:  39
Usa:  46 China:  40
Usa:  46 China:  40
Usa:  47 China:  40
Usa:  47 China:  40
Usa:  48 China:  40
Usa:  49 China:  40
Usa:  49 China:  40
Usa:  49 China:  40
Usa:  50 China:  40
Usa:  50 China:  40
Usa:  50 China:  40
Usa:  50 China:  40
Usa:  50 China:  40
Usa:  50 China:  40
Usa:  51 China:  40
Usa:  51 China:  40
Usa:  51 China:  40
Usa:  52 China:  40
Usa:  52 China:  40
Usa:  52 China:  40
Usa:  53 China:  40
Usa:  53 China:  40
Usa:  53 China:  40
Usa:  53 China:  41
Usa:  53 China:  41
Usa:  54 China:  41
Usa:  54 China:  42
Usa:  54 China:  42
Usa:  54 China:  42
Usa:  55 China:  42
Usa:  55 China:  42
Usa:  55 China:  42
Usa:  55 China:  42
Usa:  56 China:  42
Usa:  56 China:  42
Usa:  56 China:  42
Usa:  57 China:  42
Usa:  57 China:  42
Usa:  58 China:  42
Usa:  58 China:  42
Usa:  58 China:  42
Usa:  58 China:  42
Usa:  58 China:  42
Usa:  58 China:  42
Usa:  58 China:  42
Usa:  58 China:  42




Usa:  85 China:  62
Usa:  85 China:  63
Usa:  85 China:  63
Usa:  85 China:  63
Usa:  85 China:  64
Usa:  86 China:  64
Usa:  86 China:  64
Usa:  86 China:  64
Usa:  86 China:  65
Usa:  86 China:  65




Usa:  86 China:  65
Usa:  86 China:  65
Usa:  87 China:  65
Usa:  87 China:  65
Usa:  87 China:  65
Usa:  87 China:  66
Usa:  87 China:  67
Usa:  87 China:  67
Usa:  88 China:  67
Usa:  88 China:  67
Usa:  88 China:  67
Usa:  89 China:  67
Usa:  89 China:  67
Usa:  89 China:  67
Usa:  89 China:  67
Usa:  89 China:  67
Usa:  90 China:  67
Usa:  90 China:  67
Usa:  90 China:  67
Usa:  90 China:  67
Usa:  90 China:  67
Usa:  90 China:  67
Usa:  90 China:  67
Usa:  90 China:  67
Usa:  90 China:  68
Usa:  90 China:  69
Usa:  90 China:  69
Usa:  90 China:  69
Usa:  90 China:  69
Usa:  91 China:  69
Usa:  92 China:  69
Usa:  92 China:  69
Usa:  92 China:  69
Usa:  92 China:  69
Usa:  92 China:  69
Usa:  92 China:  70
Usa:  93 China:  70
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71
Usa:  93 China:  71




Usa:  94 China:  71
Usa:  94 China:  71
Usa:  95 China:  71
Usa:  95 China:  71
Usa:  95 China:  71
Usa:  95 China:  71
Usa:  96 China:  71
Usa:  96 China:  71
Usa:  96 China:  72
Usa:  96 China:  72
Usa:  96 China:  72
Usa:  96 China:  72
Usa:  96 China:  73
Usa:  97 China:  73
Usa:  97 China:  73
Usa:  97 China:  73
Usa:  97 China:  73
Usa:  97 China:  73
Usa:  97 China:  73
Usa:  98 China:  73
Usa:  99 China:  73
Usa:  100 China:  73
Usa:  100 China:  74
Usa:  100 China:  75
Usa:  100 China:  75
Usa:  100 China:  75
Usa:  100 China:  75
Usa:  101 China:  75
Usa:  101 China:  75
Usa:  101 China:  75
Usa:  101 China:  75
Usa:  101 China:  75
Usa:  101 China:  75
Usa:  101 China:  75
Usa:  101 China:  75
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  101 China:  76
Usa:  102 China:  76
Usa:  102 Chi



Usa:  110 China:  83
Usa:  110 China:  83
Usa:  111 China:  83
Usa:  111 China:  83
Usa:  111 China:  83
Usa:  111 China:  83
Usa:  111 China:  83
Usa:  111 China:  84
Usa:  111 China:  84
Usa:  111 China:  84
Usa:  111 China:  84
Usa:  112 China:  84
Usa:  113 China:  84
Usa:  113 China:  84
Usa:  114 China:  84
Usa:  114 China:  84
Usa:  114 China:  84
Usa:  114 China:  84
Usa:  114 China:  84
Usa:  115 China:  84
Usa:  115 China:  84
Usa:  116 China:  84
Usa:  117 China:  84
Usa:  117 China:  84
Usa:  117 China:  84
Usa:  118 China:  84
Usa:  118 China:  84
Usa:  118 China:  84
Usa:  118 China:  84
Usa:  118 China:  84
Usa:  118 China:  84
Usa:  118 China:  85
Usa:  118 China:  85
Usa:  118 China:  85
Usa:  118 China:  85
Usa:  118 China:  86
Usa:  118 China:  86
Usa:  118 China:  86
Usa:  118 China:  86
Usa:  118 China:  86
Usa:  118 China:  86
Usa:  119 China:  86
Usa:  119 China:  86
Usa:  119 China:  86
Usa:  119 China:  87
Usa:  119 China:  87
Usa:  119 China:  87
Usa:  119 Chi

KeyboardInterrupt: 