In [1]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('/content/twitter_training.csv')

# Display the first few rows
df.head()


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [2]:
# Rename the columns
df.columns = ['ID', 'Topic', 'Sentiment', 'Text']

# Display the first few rows to confirm the changes
df.head()


Unnamed: 0,ID,Topic,Sentiment,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
import re
# Function to clean text by removing URLs and special characters

def clean_text(text):
    # Convert non-string values (like NaN or floats) to empty strings
    if not isinstance(text, str):
        return ""

    # Remove URLs (patterns like http:// or https:// followed by characters)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove special characters, keeping only letters, numbers, and spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # Remove extra spaces
    text = text.strip()

    return text

# Apply the cleaning function to the text column
df['Text'] = df['Text'].astype(str).apply(clean_text)  # Replace 'text_column' with the actual column name
df.head()

Unnamed: 0,ID,Topic,Sentiment,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [11]:
!pip install spacy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [13]:
# Limit the dataset to 10,000 entries
df = df.head(10000)

In [15]:
!pip install tqdm



In [16]:
import spacy
from tqdm import tqdm

# Enable tqdm progress bar for Pandas
tqdm.pandas()

# Load English tokenizer model
nlp = spacy.load("en_core_web_sm")

# Tokenize the 'Text' column with a progress bar
df['tokenized_text'] = df['Text'].progress_apply(lambda x: [token.text for token in nlp(str(x))])

# Display the first few rows to confirm
df.head()


100%|██████████| 10000/10000 [01:54<00:00, 87.31it/s]


Unnamed: 0,ID,Topic,Sentiment,Text,tokenized_text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[I, am, coming, to, the, borders, and, I, will..."
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,"[i, m, getting, on, borderlands, and, i, will,..."
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[i, m, coming, on, borderlands, and, i, will, ..."
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[i, m, getting, on, borderlands, 2, and, i, wi..."
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"[i, m, getting, into, borderlands, and, i, can..."


In [18]:
from tqdm import tqdm
import spacy

# Enable tqdm progress bar for Pandas
tqdm.pandas()

# Load English tokenizer and lemmatizer model
nlp = spacy.load("en_core_web_sm")

# Function to lemmatize text
def lemmatize_text(text):
    # Process the text with spaCy
    doc = nlp(str(text))  # Convert to string to handle NaN values
    # Extract lemmatized words
    return [token.lemma_ for token in doc]

# Apply lemmatization with a progress bar
df['lemmatized_text'] = df['Text'].progress_apply(lemmatize_text)

# Display the first few rows
df.head()


100%|██████████| 10000/10000 [01:48<00:00, 92.16it/s]


Unnamed: 0,ID,Topic,Sentiment,Text,tokenized_text,lemmatized_text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,"[I, am, coming, to, the, borders, and, I, will...","[I, be, come, to, the, border, and, I, will, k..."
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,"[i, m, getting, on, borderlands, and, i, will,...","[I, m, get, on, borderland, and, I, will, kill..."
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[i, m, coming, on, borderlands, and, i, will, ...","[I, m, come, on, borderland, and, I, will, mur..."
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,"[i, m, getting, on, borderlands, 2, and, i, wi...","[I, m, get, on, borderland, 2, and, I, will, m..."
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"[i, m, getting, into, borderlands, and, i, can...","[I, m, get, into, borderland, and, I, can, mur..."


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

# Enable tqdm for pandas
tqdm.pandas()

# Initialize CountVectorizer to create BoW representation
vectorizer = CountVectorizer()

# Convert lemmatized tokens back into sentences with a progress bar
df['lemmatized_text_str'] = df['lemmatized_text'].progress_apply(lambda x: ' '.join(x))

# Fit and transform the text to create a document-term matrix with a progress bar
bow_matrix = vectorizer.fit_transform(tqdm(df['lemmatized_text_str'], desc="Generating BoW Matrix"))

# Convert the sparse matrix into a Pandas DataFrame for better visualization
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

bow_df

100%|██████████| 10000/10000 [00:00<00:00, 322966.70it/s]
Generating BoW Matrix: 100%|██████████| 10000/10000 [00:00<00:00, 25546.18it/s]


Unnamed: 0,00,000,001,01,012143est,02,03,03573057,0359873057,04,...,zoom,zork,zulyxw3h0r,zw6i2qikai,zxxxvid,zxxxvidsspace,zxxxvidsspacepphph5b6a24,zxxxvispace,zyfapoihpy,zzgi8xvk7
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Enable tqdm for pandas
tqdm.pandas()

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Convert lemmatized tokens back into sentences with a progress bar
df['lemmatized_text_str'] = df['lemmatized_text'].progress_apply(lambda x: ' '.join(x))

# Compute TF-IDF scores with a progress bar
tfidf_matrix = tfidf_vectorizer.fit_transform(tqdm(df['lemmatized_text_str'], desc="Computing TF-IDF"))

# Convert the sparse matrix into a Pandas DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Display the first few rows of the TF-IDF matrix
tfidf_df.head()

100%|██████████| 10000/10000 [00:00<00:00, 288700.87it/s]
Computing TF-IDF: 100%|██████████| 10000/10000 [00:00<00:00, 36383.50it/s]


Unnamed: 0,00,000,001,01,012143est,02,03,03573057,0359873057,04,...,zoom,zork,zulyxw3h0r,zw6i2qikai,zxxxvid,zxxxvidsspace,zxxxvidsspacepphph5b6a24,zxxxvispace,zyfapoihpy,zzgi8xvk7
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Instead of applying PCA, we truncated the dataset to reduce the workload. PCA would be too computationally expensive in our case.

In [25]:
from sklearn.model_selection import train_test_split

# Define features (TF-IDF matrix) and target (Sentiment labels)
X = tfidf_df  # TF-IDF features
y = df['Sentiment']  # Replace with actual sentiment column name

# Split into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display dataset sizes
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (8000, 8973)
Testing set size: (2000, 8973)


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Initialize Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Wrap the training process in tqdm to show a progress bar
with tqdm(total=1, desc="Training Logistic Regression") as pbar:
    log_reg.fit(X_train, y_train)
    pbar.update(1)  # Update the progress bar after training is complete

# Predict on test data with a progress bar
with tqdm(total=1, desc="Predicting on Test Data") as pbar:
    y_pred = log_reg.predict(X_test)
    pbar.update(1)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Logistic Regression Accuracy: {accuracy:.4f}")

# Print detailed classification report
print(classification_report(y_test, y_pred))


Training Logistic Regression: 100%|██████████| 1/1 [00:37<00:00, 37.49s/it]
Predicting on Test Data: 100%|██████████| 1/1 [00:00<00:00,  6.16it/s]


Logistic Regression Accuracy: 0.8360
              precision    recall  f1-score   support

  Irrelevant       0.88      0.75      0.81       371
    Negative       0.85      0.82      0.84       465
     Neutral       0.83      0.84      0.84       521
    Positive       0.81      0.89      0.85       643

    accuracy                           0.84      2000
   macro avg       0.84      0.83      0.83      2000
weighted avg       0.84      0.84      0.84      2000



In [28]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Initialize Decision Tree model
decision_tree = DecisionTreeClassifier(random_state=42)

# Wrap the training process in tqdm to show a progress bar
with tqdm(total=1, desc="Training Decision Tree") as pbar:
    decision_tree.fit(X_train, y_train)
    pbar.update(1)  # Update progress bar after training completes

# Predict on test data with a progress bar
with tqdm(total=1, desc="Predicting on Test Data") as pbar:
    y_pred_dt = decision_tree.predict(X_test)
    pbar.update(1)

# Evaluate model performance
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt:.4f}")

# Print detailed classification report
print(classification_report(y_test, y_pred_dt))


Training Decision Tree: 100%|██████████| 1/1 [00:20<00:00, 20.81s/it]
Predicting on Test Data: 100%|██████████| 1/1 [00:00<00:00,  6.92it/s]


Decision Tree Accuracy: 0.7480
              precision    recall  f1-score   support

  Irrelevant       0.70      0.69      0.70       371
    Negative       0.76      0.71      0.73       465
     Neutral       0.73      0.79      0.76       521
    Positive       0.79      0.78      0.78       643

    accuracy                           0.75      2000
   macro avg       0.74      0.74      0.74      2000
weighted avg       0.75      0.75      0.75      2000



In [29]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Initialize SVM classifier
svm_model = SVC(kernel='linear', random_state=42)

# Wrap training process in tqdm to show progress
with tqdm(total=1, desc="Training SVM") as pbar:
    svm_model.fit(X_train, y_train)
    pbar.update(1)  # Update progress bar after training completes

# Predict on test data with a progress bar
with tqdm(total=1, desc="Predicting on Test Data") as pbar:
    y_pred_svm = svm_model.predict(X_test)
    pbar.update(1)

# Evaluate model performance
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f"SVM Accuracy: {accuracy_svm:.4f}")

# Print detailed classification report
print(classification_report(y_test, y_pred_svm))


Training SVM: 100%|██████████| 1/1 [06:49<00:00, 409.49s/it]
Predicting on Test Data: 100%|██████████| 1/1 [01:27<00:00, 87.85s/it]

SVM Accuracy: 0.8770
              precision    recall  f1-score   support

  Irrelevant       0.88      0.85      0.86       371
    Negative       0.89      0.87      0.88       465
     Neutral       0.88      0.87      0.87       521
    Positive       0.87      0.91      0.89       643

    accuracy                           0.88      2000
   macro avg       0.88      0.87      0.88      2000
weighted avg       0.88      0.88      0.88      2000




