# Sentiment Analysis ML for Techwise Cohort 3 Final Project (WebDev)
# ML Model Possibilities:
*   Linear Regression
*   Random Forest
*   SVM
*   XGBoost / LightGBM
*   LSTM / GRU (RNN)
*   BERT / RoBERTa
*   Naive Bayes
*   VADER(Maybe)
---
But first, we need to download, clean, and analyze the dataset.
We are using this dataset (https://www.kaggle.com/datasets/kazanova/sentiment140).
This dataset contains 1.6 million tweets from X.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# file_path = '/content/drive/My Drive/training.1600000.processed.noemoticon.csv'


In [None]:
import pandas as pd

# Column names to assign to the CSV, in file order, based on dataset documentation.
# They are passed through names= below because the preview output shows the first
# line of the file is already a tweet, not a header row.
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']

# Load the file
# NOTE(review): file_path is only assigned in the commented-out cell above; that
# assignment (or an equivalent one) has to run first, otherwise this line raises NameError.
df = pd.read_csv(file_path, encoding='ISO-8859-1', names=columns)

# Preview it
df.head()


Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


NICE WE GOT THE DATA.

---

We need to clear out any "zeroes" — in this case, empty tweets or those with no target. We also don't need the date, the ids, the flag, or the user. The only two columns we need are target and text.

In [None]:
# Only 'target' and 'text' are needed from here on; drop the other columns in place
unused_columns = ['ids', 'date', 'flag', 'user']
df.drop(columns=unused_columns, inplace=True)
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Snazzy, now we only have the text and our target value, but this target value seems fishy simply because the only value I've seen in the head so far is 0. So let's explore that further.

In [None]:
# How many tweets carry each label value
df['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


There are only 0s and 4s, and it looks to be a perfectly even split, which is a little weird. According to the context provided with the dataset, we have 800k positive and 800k negative tweets, which is not ideal for me. Alas, we will move on from here and give it a go.

---

We gotta do some careful cleaning to remove irrelevant info. The cleaning steps are:


*   Lowercase all text
*   Remove URLs, mentions, and hashtags
*   Remove punctuation and digits
*   Remove stopwords and stem the remaining words




In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Fetch the NLTK stopword corpus (needed before stopwords.words() can be read)
nltk.download('stopwords')
# English stopwords kept as a set, since clean_text tests every word for membership
stop_words = set(stopwords.words('english'))
# Single shared Porter stemmer instance reused by clean_text for every word
stemmer = PorterStemmer()

def clean_text(text):
    """Normalize one raw tweet into a space-separated string of stemmed words.

    The tweet is lowercased, then URLs, @mentions, #hashtags, punctuation and
    digits are stripped, in that order. What remains is split on whitespace,
    words found in ``stop_words`` are dropped, and every surviving word is
    run through ``stemmer``.
    """
    # Patterns are applied in order: URLs, mentions and hashtags have to go before
    # the punctuation pass removes the characters that identify them
    scrub_patterns = (
        r"http\S+|www\S+|https\S+",  # URLs
        r"@\w+|#\w+",                # mentions and hashtags
        r"[^\w\s]",                  # punctuation
        r"\d+",                      # digits
    )
    cleaned = text.lower()
    for pattern in scrub_patterns:
        cleaned = re.sub(pattern, '', cleaned)

    kept = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        kept.append(stemmer.stem(token))
    return " ".join(kept)

# Replace every tweet with its cleaned form, then recode the label so that
# 4 becomes 1 and every other value becomes 0
df['text'] = df['text'].map(clean_text)
df['target'] = df['target'].eq(4).astype('int64')  # 1 = pos, 0 = neg

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Now, ML algorithms cannot work directly with raw text, so we need to create a numerical representation of each word or sentence.
We also need to tokenize the words so that they are split into meaningful units.
Once all that is done, we need to take those tokens, turn them into numbers, and make them vectors of important words with a numeric representation.

For this we will use Term Frequency-Inverse Document Frequency (TF-IDF)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the cleaned tweets into a TF-IDF matrix; max_features=5000 caps the vocabulary size.
# NOTE(review): the model cells further down build vectorizer, X and y again with the
# same settings, so this cell's results are overwritten there.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text'])
# Labels after recoding: 1 = positive, 0 = negative
y = df['target']


In [None]:
# Print the column dtypes and non-null counts of the cleaned frame
df.info()
# Preview the first rows to see what the cleaned text looks like
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   target  1600000 non-null  int64 
 1   text    1600000 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


Unnamed: 0,target,text
0,0,that bummer shoulda got david carr third day
1,0,upset cant updat facebook text might cri resul...
2,0,dive mani time ball manag save rest go bound
3,0,whole bodi feel itchi like fire
4,0,behav im mad cant see


# Linear regression model

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Load your preprocessed data
# Assume df has columns 'text' (cleaned string) and 'target' (0 = negative, 1 = positive)
# A 0/1 target still works with linear regression as a baseline, but logistic regression
# is the better fit for classification

# Define target variable
y = df['target']

# Split the text BEFORE vectorizing so the TF-IDF vocabulary and IDF weights are learned
# from the training tweets only. Fitting the vectorizer on the full corpus lets test-set
# statistics leak into the features and makes the scores below optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Vectorize: fit on the training split, then reuse that fitted vocabulary on the test split
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Create and train the Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions (continuous scores, not class labels)
y_pred = model.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")


Mean Squared Error: 0.1636
R² Score: 0.3456


In [None]:
from sklearn.metrics import accuracy_score

# Turn each continuous regression output into a 0/1 label using a 0.5 cut-off
y_pred_binary = [int(score >= 0.5) for score in y_pred]

# Share of test tweets whose thresholded label matches the true label
accuracy = accuracy_score(y_test, y_pred_binary)

print(f"Accuracy: {accuracy:.4f}")


Accuracy: 0.7683


 # Random Forest Model

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 1: Split the cleaned text into training and testing sets first, so the
# vectorizer below never sees the test tweets (fitting it on the full corpus would
# leak test-set vocabulary and IDF statistics into the features)
y = df['target']  # Assumes values are 0 and 1
text_train, text_test, y_train, y_test = train_test_split(
    df['text'], y, test_size=0.2, random_state=42
)

# Step 2: Vectorize the text using TF-IDF, fitted on the training split only.
# This fitted vectorizer is also the one predict_sentiment() uses on new text.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)


In [None]:
# Step 3: Train Random Forest
# n_jobs=-1 builds the 100 trees on all available CPU cores instead of one; the fixed
# random_state keeps the run reproducible
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Step 4: Make predictions (class labels 0/1 for the held-out tweets)
y_pred = rf_model.predict(X_test)


In [None]:
# Step 5: Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Optional: Detailed report, printed as a heading followed by the metric output
for heading, metric_fn in (
    ("\nClassification Report:", classification_report),
    ("\nConfusion Matrix:", confusion_matrix),
):
    print(heading)
    print(metric_fn(y_test, y_pred))

In [None]:
# Optional: Detailed report for the current y_pred (heading and body on separate lines)
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")

print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

# Testing
Using text not from the training or test set.

In [None]:
import numpy as np

# Function to clean new input the same way you cleaned the training data
def clean_input(text):
    """Clean new input text exactly the way the training tweets were cleaned.

    Delegates to ``clean_text``, the function that was applied to ``df['text']``
    before vectorizing. Having a single implementation means inference-time
    preprocessing cannot drift from training-time preprocessing, and the
    stopword set and stemmer are no longer rebuilt on every call.

    Args:
        text: Raw input string.

    Returns:
        The lowercased, scrubbed, stopword-filtered and stemmed text, with the
        remaining words joined by single spaces.
    """
    return clean_text(text)

# Function to predict sentiment
def predict_sentiment(text):
    """Print and return the Random Forest sentiment prediction for one text.

    The text is cleaned with ``clean_input``, vectorized with the fitted
    ``vectorizer`` and scored by ``rf_model``.

    Args:
        text: Raw input string (not required to come from the dataset).

    Returns:
        dict with keys:
            "cleaned": the preprocessed text that was fed to the vectorizer,
            "probability": probability of the positive class as a float,
            "score": that probability scaled and rounded to an int from 0 to 4,
            "label": "Positive" or "Negative".
    """
    cleaned = clean_input(text)
    vect_text = vectorizer.transform([cleaned])

    # Score the text once: the forest's predict() is the argmax over predict_proba(),
    # so the class is derived from the same probabilities instead of a second pass
    class_probs = rf_model.predict_proba(vect_text)[0]

    # Get predicted class: 0 or 1
    pred_class = rf_model.classes_[np.argmax(class_probs)]

    # Get prediction confidence; classes_ is sorted, so index 1 is the positive class (1)
    prob = class_probs[1]

    # Scale the prediction from 0 to 4
    scaled_sentiment = int(round(prob * 4))

    label = 'Positive' if pred_class == 1 else 'Negative'

    print(f"\nInput: {text}")
    print(f"Cleaned: {cleaned}")
    print(f"Predicted sentiment score (0 to 4): {scaled_sentiment}")
    print(f"Binary classification: {label} (class {4 if pred_class == 1 else 0})")

    # Returned as well as printed so callers (e.g. a web backend) can use the values
    return {
        "cleaned": cleaned,
        "probability": float(prob),
        "score": scaled_sentiment,
        "label": label,
    }


In [None]:
# predict_sentiment("I absolutely love this!")
# predict_sentiment("This is the worst day ever.")