## Sentiment Analysis

In [2]:
import ssl, sqlite3, pandas as pd
from pathlib import Path

# NOTE: this does NOT create a secure context. It replaces the standard
# library's default HTTPS context factory with the unverified one, so HTTPS
# requests that rely on that default skip certificate verification.
# NOTE(review): presumably a workaround for certificate errors when downloading
# resources (e.g. NLTK data) — confirm it is still needed, and remove it if not.
ssl._create_default_https_context = ssl._create_unverified_context

# Function to load SQL table to DataFrame
def load_sql(db_name, tbl_name):
  """Load one table of a SQLite database into a DataFrame.

  Args:
    db_name: Database file stem; the file opened is ``database/{db_name}.db``
      (relative to the current working directory).
    tbl_name: Name of the table to select from. It is interpolated into the
      query text unescaped, so it must come from trusted code, not user input.

  Returns:
    A DataFrame holding every row and column of the table.

  Raises:
    Whatever ``pd.read_sql`` raises, e.g. when the table does not exist.
  """
  con = sqlite3.connect(f'database/{db_name}.db')
  try:
    return pd.read_sql(f"SELECT * FROM {tbl_name}", con)
  finally:
    # Close the connection even when the query fails, so the database file is
    # not left open for the rest of the kernel session.
    con.close()

# Function to save DataFrame to SQLite database
def save_sql(df, filename, action="replace"):
  """Save a DataFrame to a SQLite database file.

  The frame is written to ``database/{filename}.db`` (relative to the current
  working directory) in a table that is also named ``filename``. The
  ``database`` directory is created when it does not exist yet.

  Args:
    df: DataFrame to persist. Its index is not written.
    filename: Used both as the database file stem and as the table name.
    action: Passed to ``DataFrame.to_sql`` as ``if_exists``: "replace" drops
      and recreates an existing table, "append" adds rows to it.
  """
  db_name = Path(f'database/{filename}.db')
  db_name.parent.mkdir(parents=True, exist_ok=True)
  con = sqlite3.connect(db_name)
  try:
    df.to_sql(filename, con, index=False, if_exists=action)
  finally:
    # Always release the connection; previously it stayed open after each
    # call, keeping the database file open for the whole kernel session.
    con.close()

In [3]:
# Load preprocessed data: table `tweets_v7` from database/tweets_v7.db
df = load_sql('tweets_v7', 'tweets_v7')

### Rule-based SA

In [4]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()

def rulebased_sentiment(text):
  """Label a text as positive (True) or not positive (False) with VADER.

  The text is scored by the module-level analyzer ``sa``. The result is True
  only when the ``compound`` score is strictly greater than 0, so a score of
  exactly 0 is labelled False together with the negative scores.
  """
  scores = sa.polarity_scores(text)
  compound = scores['compound']
  return compound > 0

In [5]:
# Label every tweet with the rule-based classifier (True = positive).
df['rulebased_sent'] = df['text'].apply(rulebased_sentiment)
# Spot-check three random rows; no seed is set, so the rows differ per run.
df[['text', 'rulebased_sent']].sample(3)

Unnamed: 0,text,rulebased_sent
1344,missile strike strike poland today deliberate ...,False
1317,poor canadian duram wheat harvest late 2021 pu...,False
81,mass housing ukraine war present urban graphic...,True


In [6]:
# Persist the labelled frame to database/tweets_v5.db (table `tweets_v5`); the
# default action replaces an existing table.
# NOTE(review): the frame was loaded from `tweets_v7` but is saved as
# `tweets_v5` — confirm the version naming is intentional.
save_sql(df, 'tweets_v5')

### Supervised ML SA
#### Bag of Words

In [7]:
# Let pandas use up to 100 characters per line when printing frames as text.
pd.set_option('display.width', 100)

from nltk.tokenize import casual_tokenize
from collections import Counter

In [8]:
# One Counter of token frequencies per tweet, in the same order as df's rows.
bag_of_words = [Counter(casual_tokenize(text)) for text in df.text]

# Each distinct token becomes a column. A token missing from a tweet is NaN
# after from_records, so fill with 0 and cast back to integer counts.
df_bows = pd.DataFrame.from_records(bag_of_words).fillna(0).astype(int)

In [9]:
# Shape is (number of tweets, number of distinct tokens).
print(df_bows.shape)
# Transpose a random 10-row sample so tokens run down the page.
df_bows.sample(10).T

(2893, 6712)


Unnamed: 0,639,1211,145,2863,1432,2465,1349,790,1241,604
it,0,0,0,1,0,0,0,1,0,0
',1,0,1,1,0,0,1,1,1,1
s,3,0,1,1,0,0,1,1,1,0
easy,0,0,0,0,0,0,0,0,0,0
pull,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
prevailed,0,0,0,0,0,0,0,0,0,0
lat,0,0,0,0,0,0,0,0,0,0
prolong,0,0,0,0,0,0,0,0,0,0
agony,0,0,0,0,0,0,0,0,0,0


In [10]:

from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the raw token counts.
# NOTE: the targets are the labels produced by rulebased_sentiment, not human
# annotations, so the model learns to imitate the rule-based classifier.
nb = MultinomialNB()
nb.fit(df_bows, df['rulebased_sent'])

MultinomialNB()

In [11]:
# df['nb_sent'] = nb.predict(df_bows)
# NOTE: this predicts on the same rows the model was fitted on, so the result
# says nothing about performance on unseen text.
nb_sent = nb.predict(df_bows)

In [12]:
# Show the predicted labels (long arrays are abbreviated in the repr).
nb_sent

array([ True, False,  True, ..., False, False, False])

In [None]:
# save_sql(df, 'tweets_v5')

#### Support Vector Machines

In [13]:
# Reload the frame written by the earlier save_sql(df, 'tweets_v5') call; it
# carries the `rulebased_sent` column used as the target below.
df = load_sql('tweets_v5', 'tweets_v5')

In [14]:
from sklearn.model_selection import train_test_split

# 80/20 split of the tweet texts and their rule-based labels. Stratifying on
# the label keeps the class ratio the same in both parts, and the fixed seed
# makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['rulebased_sent'],
    test_size=0.2,
    train_size=0.8,
    random_state=42,
    shuffle=True,
    stratify=df['rulebased_sent'],
)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the texts (the classifier itself is trained in a later cell).
# The vocabulary and IDF weights are fitted on the training split only; the
# test split is transformed with that same fitted vectorizer so both matrices
# share identical columns.
tfidf = TfidfVectorizer()

X_train_tf = tfidf.fit_transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [16]:
# Sanity check: both matrices must have the same number of columns.
print(X_train_tf.shape)
print(X_test_tf.shape)

(2314, 5799)
(579, 5799)


In [17]:
from sklearn import svm

# Linear-kernel support vector classifier trained on the TF-IDF features of
# the training split. max_iter=-1 means no limit on solver iterations.
model = svm.SVC(kernel='linear', max_iter=-1)
model.fit(X_train_tf, y_train)

SVC(kernel='linear')

In [18]:
# Model evaluation: predict labels for the held-out test split.
Y_pred = model.predict(X_test_tf)

In [19]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Compare the SVM predictions with the rule-based labels of the test split.
# NOTE: the reference labels come from the rule-based classifier, so these
# scores measure agreement with it rather than accuracy against human labels.
print('Accuracy score: ', accuracy_score(y_test, Y_pred))
print(classification_report(y_test, Y_pred))
# Rows are the actual classes, columns the predicted classes.
print(confusion_matrix(y_test, Y_pred))

Accuracy score:  0.8221070811744386
              precision    recall  f1-score   support

           0       0.82      0.95      0.88       398
           1       0.84      0.54      0.65       181

    accuracy                           0.82       579
   macro avg       0.83      0.74      0.77       579
weighted avg       0.82      0.82      0.81       579

[[379  19]
 [ 84  97]]


In [None]:
# Side-by-side view of each test tweet, its rule-based label and the SVM
# prediction. X_test and y_test keep their original row index; Y_pred is a
# plain array in the same row order.
frame = { 'text': X_test, 'actual': y_test, 'predicted': Y_pred }
result = pd.DataFrame(frame)
result

In [None]:
# Score every tweet in df with the trained SVM.
#
# The texts must be transformed by the SAME vectorizer that was fitted on the
# training split: `model` learned one weight per column of that vocabulary.
# Fitting a fresh TfidfVectorizer on the full corpus (even with max_features
# set to match the column count) maps different terms and IDF weights onto the
# columns, so predict() would run without error but return meaningless labels.
y_new = df['rulebased_sent']

X_new = tfidf.transform(df['text'])

# NOTE: df also contains the rows the model was trained on, so agreement
# between Y_pred_new and y_new is not a held-out estimate.
Y_pred_new = model.predict(X_new)

In [None]:
# Side-by-side view for the whole corpus: tweet text, rule-based label
# ('actual') and SVM prediction ('predicted').
frame = {'text': df['text'], 'actual': y_new, 'predicted': Y_pred_new }
result = pd.DataFrame(frame)
result

In [None]:
# Work on a copy so `df` itself is left without the new column.
df2 = df.copy()
df2['svm_sent'] = Y_pred_new

In [None]:
# Preview the first rows, including the new `svm_sent` column.
df2.head(5)

In [None]:
# Persist the frame with both sentiment columns to database/tweets_v6.db
# (table `tweets_v6`), replacing any existing table.
save_sql(df2, 'tweets_v6')