In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Clean raw tweets
def clean_text(text: str) -> str:
    """Normalize a raw tweet into lowercase, space-separated ASCII words.

    Steps, in order: drop whitespace-separated tokens containing ``@``
    (user mentions and e-mail addresses), remove URLs, replace every
    character other than ASCII letters and apostrophes with a space,
    lowercase, and collapse runs of whitespace.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned text, with no leading or trailing whitespace.
    """
    # Any token containing '@' is treated as a mention or e-mail address.
    text = ' '.join(token for token in text.split() if '@' not in token)
    # URLs have to go before the letters-only filter; otherwise their
    # pieces ("http", "t", "co", ...) survive as junk words.
    text = re.sub(r'(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)
    # Keep ASCII letters and apostrophes only. This also replaces every
    # non-ASCII character, so no separate Unicode-stripping pass is needed.
    text = re.sub(r"[^a-zA-Z']", ' ', text)
    text = text.lower()
    # Collapse whitespace runs and trim the ends left by removed tokens.
    return re.sub(r'\s+', ' ', text).strip()

# Fetch the NLTK resources used below: the stop-word corpus and the
# VADER sentiment lexicon.
for resource in ('stopwords', 'vader_lexicon'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests
STOP_WORDS = set(stopwords.words('english'))

# Load dataset
def _strip_stop_words(text):
    """Return *text* with every word found in STOP_WORDS removed."""
    kept = [word for word in text.split() if word not in STOP_WORDS]
    return ' '.join(kept)


df = pd.read_csv("train.csv")  # columns: id, label, tweet
# Two-stage cleaning: normalized text first, then the same text without
# stop words.
df["clean_tweet"] = df["tweet"].map(clean_text)
df["cleaned_tweet"] = df["clean_tweet"].map(_strip_stop_words)
# Number of words left in each tweet after stop-word removal.
df["word_count"] = df["cleaned_tweet"].map(lambda text: len(text.split()))

# Add polarity score
def _compound_label(compound):
    """Return '0' when the compound score is <= 0, otherwise '1'."""
    if compound <= 0:
        return '0'
    return '1'


sid = SentimentIntensityAnalyzer()
# Scores are computed on the raw tweet column, not on the cleaned text.
df["scores"] = df["tweet"].apply(sid.polarity_scores)
df["compound"] = df["scores"].map(lambda scores: scores['compound'])
df["comp_score"] = df["compound"].map(_compound_label)

# Train-test split
# Split the text *before* vectorizing so the held-out tweets play no part
# in building the vocabulary (avoids train/test leakage).
y = df["label"]
texts_train, texts_test, y_train, y_test = train_test_split(
    df["cleaned_tweet"], y, test_size=0.2, random_state=42
)

# Feature extraction
# Bag-of-words counts: the vocabulary is learned from the training split
# only and then applied unchanged to the test split.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
# Full-corpus matrix under the train-only vocabulary; kept so the name X
# stays defined for any code that runs after this section.
X = vectorizer.transform(df["cleaned_tweet"])

# Model training
# Fixed seed so repeated runs build the same forest and report the same score.
model = RandomForestClassifier(n_estimators=200, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation
# NOTE(review): accuracy alone can mislead if the labels are imbalanced;
# consider stratify=y in the split and an F1 score — confirm against the data.
print("Accuracy:", accuracy_score(y_test, y_pred) * 100, "%")