In [149]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [150]:
df_tweets = pd.read_csv("../Data/cleaned_tweets.csv")
df_tweets = df_tweets[["tweet","sentiment"]]

# define a function to add label
def row_label(row):
    return 1 if row["sentiment"]=="Positive" else 0

# def remove_brakets(row):
#     return row["tweet"].replace(']','').replace
def remove_brakets(row):
    string= re.sub('\[|\]|\'|\.|"|,', '', row["tweet"])
    #string = string.replace(",","")
    return string

df_tweets["label"] = df_tweets.apply(row_label, axis=1)
df_tweets["tweet"] = df_tweets.apply(remove_brakets, axis=1)
df_tweets.drop("sentiment",axis=1, inplace=True)

df_tweets.drop_duplicates()
#reduce the data
df_tweets_inp = df_tweets.sample(frac=0.9, random_state=42)
df_test = df_tweets.drop(df_tweets_inp.index)
df_test = df_test.sample(frac=0.1, random_state=42)

#seperate labelled and unlabelled data
df_labeled_data = df_tweets_inp.sample(frac=0.1, random_state=42)
df_unlabeled_data = df_tweets_inp.drop(df_labeled_data.index)

#print(df_test[df_test["label"]==1].count,df_test[df_test["label"]==0].count)
#print(df_labeled_data[df_labeled_data["label"]==1].count,df_labeled_data[df_labeled_data["label"]==0].count)



df_tweets.head(10)

Unnamed: 0,tweet,label
0,im get borderland murder,1
1,come border kill,1
2,im get borderland kill,1
3,im come borderland murder,1
4,im get borderland 2 murder,1
5,im get borderland murder,1
6,spent hour make someth fun know huge fan ma...,1
7,spent coupl hour someth fun know im huge borde...,1
8,spent hour someth fun know im huge borderland ...,1
9,spent hour make someth fun know huge rhandl...,1


In [151]:
# convert text data into numerical feature vectors using TF-IDF vectorization
vectorizer = TfidfVectorizer()
labeled_features = vectorizer.fit_transform(df_labeled_data['tweet'])
test_features = vectorizer.transform(df_test['tweet'])

In [152]:
# train a logistic regression model on the labeled data
model = LogisticRegression()
model.fit(labeled_features, df_labeled_data['label'])

LogisticRegression()

In [153]:
# evaluate the performance of the model on the test data
accuracy = model.score(test_features, df_test['label'])
print("Accuracy Supervised:", accuracy)

Accuracy Supervised: 0.8023255813953488


In [154]:
# predict the sentiment of the unlabeled data
unlabeled_features = vectorizer.transform(df_unlabeled_data['tweet'])
predicted_sentiments = model.predict(unlabeled_features)

In [155]:
# select the most confident predictions and add them to the labeled data
probabilities = model.predict_proba(unlabeled_features)
confidence = np.max(probabilities, axis=1)
threshold = np.percentile(confidence, 80)
indices = np.where(confidence >= threshold)[0]
df_labeled_data = df_labeled_data.append(df_unlabeled_data.iloc[indices])
labeled_features = vectorizer.transform(df_labeled_data['tweet'])

In [156]:
# retrain the model on the updated labeled data
model.fit(labeled_features, df_labeled_data['label'])

# evaluate the performance of the model on the test data
accuracy = model.score(test_features, df_test['label'])
print("Accuracy: SemiSupervised", accuracy)

Accuracy: SemiSupervised 0.8116279069767441
