## Import the libraries

In [3]:
import string
import pandas as pd
import re
import joblib
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


## Loading the Dataset

In [4]:
HEADERS = ["sentiment", "id", "date", "query", "user", "text"]

df = pd.read_csv("sentiment140.csv", encoding="ISO-8859-1", names=HEADERS)

display(df.head())

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


# Cleaning the Dataset

In [None]:
df.drop(["id", "date", "query", "user"], axis=1, inplace=True)

# Investigate the Dataset

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   sentiment  1600000 non-null  int64 
 1   text       1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


# Handling Duplicated Values

In [None]:
df.duplicated().sum()

16309

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.shape

(1583691, 2)

## Text Cleaning

In [None]:
STOP_WORDS = set(stopwords.words("english"))

LEMMATIZER = WordNetLemmatizer()


def clean_mentions(text: str):
    return re.sub(r"@[a-zA-Z0-9_]+", "", text)


def clean_URLs(text: str):
    return re.sub(r"http\S+|www\S+|https\S+", "", text)


def clean_HTML5_entities(text: str):
    return re.sub(r"&[a-z]+;", "", text)


def clean_punctuations(text: str):
    return re.sub(r"\W", " ", text)


def clean_numbers(text: str):
    return re.sub(r"[0-9]+", "", text)


def clean_stopwords(text: str):
    return " ".join([token for token in text.split() if token not in STOP_WORDS])


def lemmatize_text(text: str):
    return " ".join([LEMMATIZER.lemmatize(token, pos="v") for token in text.split()])


def normalize(text: str):
    text = text.lower()
    text = clean_mentions(text)
    text = clean_URLs(text)
    text = clean_HTML5_entities(text)
    text = clean_punctuations(text)
    text = clean_numbers(text)
    text = clean_stopwords(text)
    text = lemmatize_text(text)
    return text

In [None]:
df["text"] = df["text"].apply(str.lower)
df['text'] = df['text'].apply(clean_URLs)
df['text'] = df['text'].apply(clean_HTML5_entities)
df['text'] = df['text'].apply(clean_numbers)
df['text'] = df['text'].apply(clean_stopwords)
df['text'] = df['text'].apply(clean_punctuations)
df["text"] = df["text"].apply(lemmatize_text)
display(df['text'])

0          switchfoot that s bummer shoulda get david car...
1          upset can t update facebook texting it might c...
2          kenichan dive many time ball manage save rest ...
3                            whole body feel itchy like fire
4          nationwideclass no behave all i m mad here can...
                                 ...                        
1599995                        wake up school best feel ever
1599996            thewdb com cool hear old walt interview â
1599997                       ready mojo makeover ask detail
1599998    happy th birthday boo alll time tupac amaru sh...
1599999    happy charitytuesday thenspcc sparkscharity sp...
Name: text, Length: 1583691, dtype: object

## Vectorize The Text

In [None]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(df['text'])

## Spliting the data

In [None]:
x_train, x_test, y_train, y_test = train_test_split(tfidf_features, df['sentiment'], test_size=0.2, random_state=42)

## Model Training

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

## Model Accuracy

In [None]:
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.7674931094686792


## Process User Input and Predict

In [None]:
def preprocess_input(text):
    cleaned_text = normalize(text)
    transformed_text = tfidf_vectorizer.transform([cleaned_text]).toarray()
    return transformed_text

def predict_sentiment(text):
    transformed_text = preprocess_input(text)
    prediction = model.predict(transformed_text)
    sentiment_mapping = {0: 'Negative', 2: 'Neutral', 4: 'Positive'}
    sentiment = sentiment_mapping[prediction[0]]
    return sentiment

## Test with User Input

In [None]:
text="my name is wafaa samy "
sentiment = predict_sentiment(text)
print(f"Text: '{text}'")
print(f"Predicted Sentiment: {sentiment}\n")

Text: 'my name is wafaa samy '
Predicted Sentiment: Positive



# Model Persistence

In [None]:
joblib.dump(model, "sentiment_model.joblib")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.joblib")

['tfidf_vectorizer.joblib']