In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing
import os #File location
from wordcloud import WordCloud #Word visualization
import matplotlib.pyplot as plt #Plotting properties
import seaborn as sns #Plotting properties
from sklearn.feature_extraction.text import CountVectorizer #Data transformation
from sklearn.model_selection import train_test_split #Data testing
from sklearn.linear_model import LogisticRegression #Prediction Model
from sklearn.metrics import accuracy_score #Comparison between real and predicted
from sklearn.preprocessing import LabelEncoder #Variable encoding and decoding for XGBoost
import re #Regular expressions
import nltk
from nltk import word_tokenize
# Fetch the NLTK data this notebook relies on: the stop-word lists plus the
# Punkt tokenizer models that word_tokenize needs.
# NOTE(review): newer NLTK releases look up 'punkt_tab' while older ones use
# 'punkt'; both are requested so either version finds its data.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

import warnings
# Suppress every Python warning for the rest of the session; this also hides
# deprecation notices raised by the libraries imported above.
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'wordcloud'

In [None]:
# Folder that holds the two CSV files. Set the TWITTER_DATA_DIR environment
# variable to point somewhere else; the fallback is the original local folder.
DATA_DIR = os.environ.get(
    "TWITTER_DATA_DIR",
    r"D:\Ultimate Programming\Data Bases\Project Dataset\Twitter",
)
col = ['Tweet Id', 'Entity','Sentiment','Tweet Content']

# header=None + names=col keeps the first record as data instead of letting
# read_csv consume it as a header row that is then overwritten by `col`.
# NOTE(review): assumes the CSVs ship without a header row (as the public
# Twitter entity-sentiment files do) -- drop header=None if your copy has one.
train_data = pd.read_csv(
    os.path.join(DATA_DIR, "twitter_training.csv"), header=None, names=col
)
val_data = pd.read_csv(
    os.path.join(DATA_DIR, "twitter_validation.csv"), header=None, names=col
)

In [None]:
# Preview the first two rows of the training frame after renaming its columns.
train_data.head(2)

In [None]:
# Preview the first two rows of the validation frame after renaming its columns.
val_data.head(2)

# Data Cleaning

In [None]:
# (rows, columns) of the training frame before any cleaning is applied.
train_data.shape

In [None]:
# Discard every row that has a missing value in any column. The result is
# reassigned rather than mutated with inplace=True, so the step reads as an
# explicit transformation of the frame.
train_data = train_data.dropna()

In [None]:
# Total number of missing cells left in the training frame (summed over all
# columns).
print(train_data.isna().sum().sum())

In [None]:
# Number of rows that are exact duplicates of an earlier row.
train_data.duplicated().sum()

In [None]:
# Remove exact duplicate rows (the first occurrence is kept). The result is
# reassigned instead of using inplace=True, then the new shape is reported.
train_data = train_data.drop_duplicates()
print(train_data.shape)

In [None]:
# Column dtypes and non-null counts of the training frame after cleaning.
train_data.info()

In [None]:
# First four rows of the cleaned training frame.
train_data.head(4)

# Exploratory Data Analysis (EDA)

In [None]:
# Bar chart of how many training tweets carry each sentiment label.
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=train_data, x='Sentiment', palette="Blues", ax=ax)
ax.set_title('Sentiment Distribution')
plt.show()

In [None]:
train_data["lower"]=train_data["Tweet Content"].astype(str).str.lower() #lowercase
train_data["lower"]=[str(data) for data in train_data.lower] #converting all to string
train_data["lower"]=train_data.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) #regex

In [None]:
val_data["lower"]=val_data["Tweet Content"].astype(str).str.lower() 
val_data["lower"]=[str(data) for data in val_data.lower] 
val_data["lower"]=val_data.lower.apply(lambda x: re.sub('[^A-Za-z0-9 ]+', ' ', x)) 

In [None]:
# First two rows of the training frame, now including the cleaned `lower` column.
train_data.head(2)

In [None]:
# Shared word-cloud renderer for the plots below: 500x500 px canvas, white
# background, smallest font size 10. WordCloud is already imported in the
# notebook's first cell, so it is not imported again here.
wc = WordCloud(
    width=500,
    height=500,
    min_font_size=10,
    background_color='white',
)

In [None]:
neg_wc = wc.generate(train_data[train_data['Sentiment'] == "Negative"]['lower'].str.cat(sep=" "))
plt.figure(figsize=(15,6))
plt.imshow(neg_wc)
plt.show()

In [None]:
neutral_wc = wc.generate(train_data[train_data['Sentiment'] == "Neutral"]['lower'].str.cat(sep=" "))
plt.figure(figsize=(15,6))
plt.imshow(neutral_wc)
plt.show()

In [None]:
pos_wc = wc.generate(train_data[train_data['Sentiment'] == "Positive"]['lower'].str.cat(sep=" "))
plt.figure(figsize=(15,6))
plt.imshow(pos_wc)
plt.show()

In [None]:
irr_wc = wc.generate(train_data[train_data['Sentiment'] == "Irrelevant"]['lower'].str.cat(sep=" "))
plt.figure(figsize=(15,6))
plt.imshow(irr_wc)
plt.show()

In [None]:
# Non-null counts of every remaining column for each (Entity, Sentiment) pair,
# with the group keys turned back into ordinary columns.
plot1 = (
    train_data
    .groupby(by=["Entity", "Sentiment"])
    .count()
    .reset_index()
)
plot1.head()

In [None]:
#Text splitting
tokens_text = [word_tokenize(str(word)) for word in train_data.lower]
#Unique word counter
tokens_counter = [item for sublist in tokens_text for item in sublist]
print("Number of tokens: ", len(set(tokens_counter)))

In [None]:
# Show the token list of the second training tweet as an example.
print(tokens_text[1], end = " ")

In [None]:
#Choosing english stopwords
stopwords_nltk = nltk.corpus.stopwords
stop_words = stopwords_nltk.words('english')
stop_words[:6]

In [None]:
#Initial Bag of Words
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=stop_words, 
    ngram_range=(1, 1) 
)

In [None]:
# Hold out 20% of the cleaned training frame for testing; random_state pins the
# shuffle so the split is reproducible.
reviews_train, reviews_test = train_test_split(
    train_data,
    test_size=0.2,
    random_state=0,
)

In [None]:
#Creation of encoding related to train dataset
X_train_bow = bow_counts.fit_transform(reviews_train.lower)
#Transformation of test dataset with train encoding
X_test_bow = bow_counts.transform(reviews_test.lower)

In [None]:
#Labels for train and test encoding
y_train_bow = reviews_train['Sentiment']
y_test_bow = reviews_test['Sentiment']

In [None]:
#n-gram of 4 words
bow_counts = CountVectorizer(
    tokenizer=word_tokenize,
    ngram_range=(1,4)
)

X_train_bow = bow_counts.fit_transform(reviews_train.lower)
X_test_bow = bow_counts.transform(reviews_test.lower)
X_val_bow = bow_counts.transform(val_data.lower)

In [None]:
# Logistic regression on the n-gram counts; fit() returns the estimator itself.
model2 = LogisticRegression(
    C=0.9,
    solver="liblinear",
    max_iter=1500,
).fit(X_train_bow, y_train_bow)

# Accuracy (in percent) on the 20% held-out split.
test_pred_2 = model2.predict(X_test_bow)
print("Accuracy: ", accuracy_score(y_test_bow, test_pred_2) * 100)

In [None]:
# Accuracy (in percent) of model2 on the separate validation file.
y_val_bow = val_data['Sentiment']
Val_pred_2 = model2.predict(X_val_bow)
print(
    "Accuracy: ",
    accuracy_score(y_val_bow, Val_pred_2) * 100,
)

In [None]:
# Re-inspect the first two rows of train_data before building the pipeline.
train_data.head(2)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [None]:
# Name of the DataFrame column that is handed to the vectoriser.
text_column = 'lower'

# Column-wise preprocessing: the cleaned text column goes through a 1- to
# 4-gram CountVectorizer. The column is given as a plain string (not a list).
preprocessor = ColumnTransformer([
    (
        'text',
        CountVectorizer(ngram_range=(1,4), tokenizer=word_tokenize),
        text_column,
    ),
])

In [None]:
# End-to-end model: text vectorisation followed by logistic regression, with
# the same hyper-parameters as model2.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('clf', LogisticRegression(max_iter=1500, solver="liblinear", C=0.9)),
])

# Train on the whole cleaned training frame; the preprocessor selects the text
# column itself, so the full DataFrame is passed as X.
pipeline.fit(X=train_data, y=train_data['Sentiment'])

In [None]:

# Predict sentiment labels for the validation frame with the fitted pipeline.
pred_val = pipeline.predict(X=val_data)

# Serialise the fitted pipeline to model.pkl in the current working directory.
import joblib

joblib.dump(pipeline, filename='model.pkl')