In [None]:
!pip install kaggle



In [None]:
# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
# In the recorded run this saved sentiment140.zip to /content.
# NOTE(review): presumably requires Kaggle API credentials to be configured
# before this cell runs — confirm.
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
 90% 73.0M/80.9M [00:00<00:00, 129MB/s]
100% 80.9M/80.9M [00:00<00:00, 118MB/s]


In [None]:
!unzip sentiment140.zip

Archive:  sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [None]:
# Load necessary libraries
import pandas as pd
import re
import string
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: the stopword list and the WordNet data
# needed by WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# Load dataset (Twitter Sentiment Analysis).
# Column names are supplied explicitly, and usecols keeps only the two columns
# this notebook uses. Reading them directly yields a standalone DataFrame, so
# the column assignment below no longer triggers pandas' SettingWithCopyWarning
# (which the former `df = df[['target', 'text']]` slice caused), and the four
# unused columns are never held in memory.
df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',
                 encoding='ISO-8859-1',
                 names=['target', 'ids', 'date', 'flag', 'user', 'text'],
                 usecols=['target', 'text'])

# Map the numeric target codes to string labels: 0 -> 'negative', 4 -> 'positive'
df['target'] = df['target'].map({0: 'negative', 4: 'positive'})

# Any code other than 0 or 4 is mapped to NaN by the line above; fail loudly
# here instead of letting missing labels flow into the label encoder.
assert df['target'].notna().all(), "unexpected target code (expected only 0 or 4)"

# Resources shared by the text-cleaning function: the English stopword set and
# the lemmatizer instance.
stop = set(stopwords.words('english'))
wl = WordNetLemmatizer()

def preprocess_text(text):
    """Clean one raw tweet so it can be fed to a bag-of-words vectorizer.

    Steps, in order: strip HTML markup with BeautifulSoup, remove URLs
    (anything starting with ``http``), remove digit runs, remove punctuation,
    lowercase, then keep only purely alphabetic tokens that are not in the
    module-level ``stop`` set, lemmatizing each with the module-level ``wl``.

    Args:
        text: The raw tweet text. Values that are neither ``str`` nor
            ``bytes`` (e.g. ``None`` or the float NaN pandas uses for an
            empty CSV field) are treated as empty text.

    Returns:
        str: The surviving lemmatized tokens joined by single spaces, or an
        empty string when nothing survives or ``text`` is not textual.
    """
    # BeautifulSoup raises on non-text markup such as None or float NaN, which
    # would abort an entire Series.apply(); treat those values as "no text".
    if not isinstance(text, (str, bytes)):
        return ""

    text = BeautifulSoup(text, "html.parser").get_text()  # Remove HTML tags
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"\d+", "", text)  # Remove numbers
    # NOTE(review): only the "@" of a user mention is punctuation, so the
    # handle itself stays in the text as an ordinary token — confirm this is
    # intended. \w also keeps "_", so tokens containing an underscore survive
    # this step and are dropped by the isalpha() filter below.
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = text.lower()  # Convert to lowercase

    # Drop stopwords and non-alphabetic tokens, then lemmatize what is left.
    tokens = [wl.lemmatize(word) for word in text.split()
              if word not in stop and word.isalpha()]
    return " ".join(tokens)

# Run the cleaning function over every tweet, replacing the raw text column
df['text'] = df['text'].map(preprocess_text)

# Show the first five rows as a quick sanity check of the cleaned frame
print(df.head(5))

# Proceed with data analysis


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = df['target'].map({0: 'negative', 4: 'positive'})
  soup = BeautifulSoup(text, "html.parser")  # Remove HTML tags


     target                                               text
0  negative  switchfoot awww thats bummer shoulda got david...
1  negative  upset cant update facebook texting might cry r...
2  negative  kenichan dived many time ball managed save res...
3  negative                    whole body feel itchy like fire
4  negative           nationwideclass behaving im mad cant see


In [None]:
# Split the dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Convert the string sentiment labels into integer class ids
label_encode = LabelEncoder()
label_encode.fit(df['target'])
y_data = label_encode.transform(df['target'])

# Hold out 20% of the tweets for testing (fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(
    df['text'],
    y_data,
    test_size=0.2,
    random_state=42,
)

# Build TF-IDF features: the vocabulary (capped at 10,000 terms) is learned
# from the training split only and then reused to encode the test split
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
x_train_encoded = tfidf_vectorizer.fit_transform(x_train)
x_test_encoded = tfidf_vectorizer.transform(x_test)


In [None]:
# Train Decision Tree model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Fit an entropy-criterion decision tree on the TF-IDF training matrix and
# predict labels for the held-out test matrix.
dt_classifier = DecisionTreeClassifier(criterion='entropy', random_state=42)
dt_classifier.fit(x_train_encoded, y_train)
y_pred = dt_classifier.predict(x_test_encoded)

# accuracy_score takes (y_true, y_pred). Accuracy is symmetric so the printed
# value is unchanged, but the correct order keeps this line safe to copy for
# metrics where the order matters.
print(f'Decision Tree Accuracy: {accuracy_score(y_test, y_pred)}')

Decision Tree Accuracy: 0.714921875


In [None]:
# Train Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest on the TF-IDF training matrix and predict labels for the
# held-out test matrix. n_jobs=-1 builds the trees on all available cores;
# with random_state fixed the fitted forest is the same as a single-core run.
rf_classifier = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_classifier.fit(x_train_encoded, y_train)
y_pred_rf = rf_classifier.predict(x_test_encoded)

# accuracy_score takes (y_true, y_pred). Accuracy is symmetric so the printed
# value is unchanged, but the correct order keeps this line safe to copy for
# metrics where the order matters.
print(f'Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf)}')