<h1>Modelling</h1>

<h3>Importing Model Ready Data</h3>

In [20]:
import pandas as pd

model_tweets = pd.read_csv('../data/model_ready_data.csv')
model_tweets = model_tweets.fillna("")
model_tweets.head()

Unnamed: 0,sentiment,lang,hashtags,clean_text
0,1,en,,rt telglobalhealth africa is in the midst of a...
1,1,en,,rt globalhlthtwit dr moeti is head of who in a...
2,1,en,red4research,rt nhsrdforum thank you research note for crea...
3,1,en,,rt highwiretalk former pfizer vp and virologis...
4,1,en,,rt peterhotez i think it s important that we d...


In [21]:
model_tweets.shape

(6417, 4)

In [22]:
# 4492 1925
model_tweets.drop(model_tweets[model_tweets['sentiment'] == -1].index, inplace=True)
model_tweets.reset_index(drop=True, inplace=True)
tweet_train = model_tweets.iloc[:2640,:]
tweet_test = model_tweets.iloc[2641:,:]

<h3>Sentiment Analysis</h3>

In [23]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from sklearn.decomposition import NMF, LatentDirichletAllocation

#### Unigram Counts

In [24]:
unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
unigram_vectorizer.fit(tweet_train['clean_text'].values)

CountVectorizer()

In [25]:
X_train_unigram = unigram_vectorizer.transform(tweet_train['clean_text'].values)

#### Unigram Tf-Idf

In [26]:
unigram_tf_idf_transformer = TfidfTransformer()
unigram_tf_idf_transformer.fit(X_train_unigram)


TfidfTransformer()

In [27]:
X_train_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_train_unigram)

#### Bigram Counts

In [28]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
bigram_vectorizer.fit(tweet_train['clean_text'].values)

CountVectorizer(ngram_range=(1, 2))

In [29]:
X_train_bigram = bigram_vectorizer.transform(tweet_train['clean_text'].values)

#### Bigram Tf-Idf

In [30]:
bigram_tf_idf_transformer = TfidfTransformer()
bigram_tf_idf_transformer.fit(X_train_bigram)

TfidfTransformer()

In [31]:
X_train_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_train_bigram)

In [32]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
import numpy as np

In [33]:
best_model = ""
best_model_name = ""
best_score = 0

def train_and_show_scores(X: csr_matrix, y: np.array, title: str) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y,train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)

    global_vars = globals()
    if(valid_score > global_vars['best_score']):
        global_vars['best_model'] = clf
        global_vars['best_model_name'] = title
        global_vars['best_score'] = valid_score

    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

In [34]:
y_train = tweet_train['sentiment'].values
y_train

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [35]:
train_and_show_scores(X_train_unigram, y_train, 'Unigram Counts')
train_and_show_scores(X_train_unigram_tf_idf, y_train, 'Unigram Tf-Idf')
train_and_show_scores(X_train_bigram, y_train, 'Bigram Counts')
train_and_show_scores(X_train_bigram_tf_idf, y_train, 'Bigram Tf-Idf')

Unigram Counts
Train score: 1.0 ; Validation score: 0.98

Unigram Tf-Idf
Train score: 1.0 ; Validation score: 0.97

Bigram Counts
Train score: 1.0 ; Validation score: 0.98

Bigram Tf-Idf
Train score: 1.0 ; Validation score: 0.97



In [36]:
print(f'The best Model is {best_model_name} with a Validation score of: {round(best_score, 2)}')

The best Model is Unigram Counts with a Validation score of: 0.98


Testing

In [41]:
tweet_test

# def run_test_using_model(best_model: SGDClassifier, model_type: str):
#     unigram_vectorizer = CountVectorizer(ngram_range=(1, 1))
#     unigram_vectorizer.fit(tweet_test['clean_text'].values)
#     X_test_unigram = unigram_vectorizer.transform(tweet_test['clean_text'].values)

#     bigram_vectorizer = CountVectorizer(ngram_range=(1, 2))
#     bigram_vectorizer.fit(tweet_test['clean_text'].values)
#     X_test_bigram = bigram_vectorizer.transform(tweet_test['clean_text'].values)

#     y_test = tweet_test['sentiment'].values

#     if(model_type == "Unigram Counts"):
#         X_test = X_test_unigram

#     elif(model_type == "Unigram Tf-Idf"):
#         unigram_tf_idf_transformer = TfidfTransformer()
#         unigram_tf_idf_transformer.fit(X_test_unigram)
#         X_test_unigram_tf_idf = unigram_tf_idf_transformer.transform(X_test_unigram)

#         X_test = X_test_unigram_tf_idf

#     elif(model_type == "Bigram Counts"):
#         X_test = X_test_bigram

#     else:
#         bigram_tf_idf_transformer = TfidfTransformer()
#         bigram_tf_idf_transformer.fit(X_test_bigram)

#         X_test_bigram_tf_idf = bigram_tf_idf_transformer.transform(X_test_bigram)
#         X_test = X_test_bigram_tf_idf

#     score = best_model.score(X_test, y_test)

#     return score


Unnamed: 0,sentiment,lang,hashtags,clean_text
2641,1,en,,rt telglobalhealth africa is in the midst of a...
2642,1,en,,rt telglobalhealth africa is in the midst of a...
2643,1,en,covid19,rt williamyang breaking moderna covid vaccines...
2644,1,en,covid19,rt williamyang breaking moderna covid vaccines...
2645,1,en,covid19,rt williamyang breaking moderna covid vaccines...
...,...,...,...,...
3295,1,en,,rt globalhlthtwit dr moeti is head of who in a...
3296,1,en,covid19,rt himantabiswa central govt will provide covi...
3297,1,en,covid19,rt livingstone s how have children experienced...
3298,1,en,,rt moetitshidi africa is in the midst of a ful...


In [38]:
run_test_using_model(best_model, best_model_name)

ValueError: X has 1420 features per sample; expecting 2952

<h3>Topic Modeling</h3>