<a href="https://colab.research.google.com/github/CRSpradlin/natural-language-processing-course/blob/main/NLP%20Course%20Work/10.%20Twitter%20Sentiment%20Analysis/TwitterSentimentAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [70]:
# data link: https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv

In [71]:
import numpy as np
import pandas as pd


In [72]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.feature_extraction.text import TfidfVectorizer

In [73]:
df = pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-data/master/twitt30k.csv')
df

Unnamed: 0,twitts,sentiment
0,@robbiebronniman Sounds like a great night.,1
1,Damn the person who stolde my wallet !!!!! Ma...,1
2,Greetings from the piano bench (photo) http:/...,1
3,@drewryanscott i love it!! i love you!! haha f...,1
4,"@kissthestars Pretty pretty pretty please, pak...",0
...,...,...
29995,@Calumfan1 is it in any way related to photosh...,0
29996,@Swiz_NZ really? wow thats crap,0
29997,"At the 2010 lexus HS250h press event. Again, ...",0
29998,@karmicunderpath ooooh now there's a nice thou...,1


1's are a positive sentiment and 0's are a negative sentiment.

In [74]:
df['sentiment'].value_counts()

sentiment
1    15000
0    15000
Name: count, dtype: int64

## SVM Model and Data Preparation

In [75]:
tweet_content = df['twitts']
tweet_sentiment = df['sentiment']

tfidf = TfidfVectorizer()
x = tfidf.fit_transform(tweet_content)

x_train, x_test, y_train, y_test = train_test_split(x, tweet_sentiment, test_size = 0.2, random_state = 0, stratify = tweet_sentiment)

print('shape of x: ', x.shape)

clf = LinearSVC()
clf.fit(x_train, y_train)

y_pred = clf.predict(x_test)

shape of x:  (30000, 40854)


In [76]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



In [77]:
def run_svm(df):
  tweet_content = df['twitts']
  tweet_sentiment = df['sentiment']

  tfidf = TfidfVectorizer()
  x = tfidf.fit_transform(tweet_content)

  x_train, x_test, y_train, y_test = train_test_split(x, tweet_sentiment, test_size = 0.2, random_state = 0, stratify = tweet_sentiment)

  print('shape of x: ', x.shape)

  clf = LinearSVC()
  clf.fit(x_train, y_train)

  y_pred = clf.predict(x_test)

  print("\nClassification Report")
  print(classification_report(y_test, y_pred))

  return tfidf, clf

In [78]:
%%time
tfidf, clf = run_svm(df)

shape of x:  (30000, 40854)

Classification Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000

CPU times: user 1.27 s, sys: 4.33 ms, total: 1.27 s
Wall time: 1.32 s


Now to test the model using the generated clf.

In [79]:
x = 'i am excited to see my best friend to get married'

In [80]:
clf.predict(tfidf.transform([x]))

array([1])

The example has been deemed as a positive sentiment statement.

## Data Cleaning and Retraining SVM
Now it is time to use the python package we created to help perform some data cleaning.

In [81]:
!pip install "git+https://github.com/CRSpradlin/natural-language-processing-course.git#egg=preprocess_crspradlin&subdirectory=NLP Course Work/7. Packaging/Preprocessing Package"

Collecting preprocess_crspradlin
  Cloning https://github.com/CRSpradlin/natural-language-processing-course.git to /tmp/pip-install-4syb8cdt/preprocess-crspradlin_6dde1dfba2ec4ce589945e9ebf0016ba
  Running command git clone --filter=blob:none --quiet https://github.com/CRSpradlin/natural-language-processing-course.git /tmp/pip-install-4syb8cdt/preprocess-crspradlin_6dde1dfba2ec4ce589945e9ebf0016ba
  Resolved https://github.com/CRSpradlin/natural-language-processing-course.git to commit 830da3e14a4beac7c6d6abfcbeeb006fea9e1245
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [82]:
import preprocess_crspradlin as prep

In [83]:
prep.__version__

'0.0.1'

In [84]:
# Apply lowercase modifier
df['twitts'] = df.twitts.apply(lambda x: x.lower())

In [85]:
# Apply Contraction to Expansion modifier
df['twitts'] = df.twitts.apply(lambda x: prep.get_contraction_to_expansion(x))

In [86]:
run_svm(df)

shape of x:  (30000, 40850)

Classification Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.75      3000
           1       0.75      0.76      0.75      3000

    accuracy                           0.75      6000
   macro avg       0.75      0.75      0.75      6000
weighted avg       0.75      0.75      0.75      6000



(TfidfVectorizer(), LinearSVC())

Not a major improvement overall but there is little improvement in recall.

In [87]:
# Apply removal modifiers
df['twitts'] = df.twitts.apply(lambda x: prep.remove_emails(x))
df['twitts'] = df.twitts.apply(lambda x: prep.remove_urls(x))
df['twitts'] = df.twitts.apply(lambda x: prep.remove_rt(x))
df['twitts'] = df.twitts.apply(lambda x: prep.remove_html_tags(x))
df['twitts'] = df.twitts.apply(lambda x: prep.remove_special_chars(x))

  return BeautifulSoup(x, 'lxml').get_text().strip()


In [88]:
tfidf, clf = run_svm(df)

shape of x:  (30000, 43375)

Classification Report
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      3000
           1       0.74      0.75      0.75      3000

    accuracy                           0.74      6000
   macro avg       0.74      0.74      0.74      6000
weighted avg       0.74      0.74      0.74      6000



Still no change in results but the data has been "cleaned up".

In [89]:
clf.predict(tfidf.transform([x]))

array([1])

## Fine Tuning the Model
Trying to get the model to have a higher percentage of accuracy.

In [90]:
def run_svm(df):
  tweet_content = df['twitts']
  tweet_sentiment = df['sentiment']

  ## UPDATES HERE ##
  tfidf = TfidfVectorizer(norm = 'l1')
  ####
  x = tfidf.fit_transform(tweet_content)

  x_train, x_test, y_train, y_test = train_test_split(x, tweet_sentiment, test_size = 0.2, random_state = 0, stratify = tweet_sentiment)

  print('shape of x: ', x.shape)

  clf = LinearSVC()
  clf.fit(x_train, y_train)

  y_pred = clf.predict(x_test)

  print("\nClassification Report")
  print(classification_report(y_test, y_pred))

  return tfidf, clf

In [91]:
run_svm(df)

shape of x:  (30000, 43375)

Classification Report
              precision    recall  f1-score   support

           0       0.76      0.75      0.76      3000
           1       0.75      0.76      0.76      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



(TfidfVectorizer(norm='l1'), LinearSVC())

With the changes to the TfidfVectorizer, you can see an increase to the results. Lets try to make additional adjustments to see how high the percentages get.

In [92]:
def run_svm(df):
  tweet_content = df['twitts']
  tweet_sentiment = df['sentiment']

  ## UPDATES HERE ##
  tfidf = TfidfVectorizer(norm = 'l1', ngram_range=(1,5), analyzer='word', max_features=5000)
  ####
  x = tfidf.fit_transform(tweet_content)

  x_train, x_test, y_train, y_test = train_test_split(x, tweet_sentiment, test_size = 0.2, random_state = 0, stratify = tweet_sentiment)

  print('shape of x: ', x.shape)

  clf = LinearSVC()
  clf.fit(x_train, y_train)

  y_pred = clf.predict(x_test)

  print("\nClassification Report")
  print(classification_report(y_test, y_pred))

  return tfidf, clf

In [93]:
tfidf, clf = run_svm(df)

shape of x:  (30000, 5000)

Classification Report
              precision    recall  f1-score   support

           0       0.75      0.77      0.76      3000
           1       0.76      0.74      0.75      3000

    accuracy                           0.76      6000
   macro avg       0.76      0.76      0.76      6000
weighted avg       0.76      0.76      0.76      6000



## Saving and Loading ML Model

In [94]:
import pickle

In [95]:
pickle.dump(clf, open('clf.pkl', 'wb')) # Write obj in write-binary mode
pickle.dump(tfidf, open('tfidf.pkl', 'wb'))

In [96]:
del clf
del tfidf
# print(clf) # fails since clf and tfidf are no longer defined

In [97]:
clf = pickle.load(open('clf.pkl', 'rb')) # open file in read-binary mode

In [98]:
tfidf = pickle.load(open('tfidf.pkl', 'rb'))

In [99]:
clf

In [100]:
len(tfidf.vocabulary_)

5000

In [101]:
clf.predict(tfidf.transform([x]))

array([1])

## Real-Time Twitter Sentiment Analysis

In [102]:
!pip install tweepy

from google.colab import userdata
# userdata.get('consumer_key') # X developer Client ID
# userdata.get('consumer_secret') # X developer Client Secret
# userdata.get('access_token') # X developer Access Token
# userdata.get('access_token_secret') # X developer Access Secret



In [103]:
import tweepy

auth = tweepy.OAuth1UserHandler(
    userdata.get('consumer_key'), userdata.get('consumer_secret'), userdata.get('access_token'), userdata.get('access_token_secret')
)


api = tweepy.API(auth)
print(api.verify_credentials().screen_name)

crspradlin_dev


Twitter/X no longer supports a fre-tier of collecting tweets via timeline or query unless you have a paid developer account (100 USD/month).

I was able to set up an account and have it display my username. That is all.