<a href="https://colab.research.google.com/github/Devansh1534/DSA-mentor-assignment-1/blob/main/speech_analyzer_optimized.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import functools
import re
import string

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree
from xgboost import XGBClassifier

# Download the NLTK resources used further down: the Punkt tokenizer data
# (needed by word_tokenize) and the English stop-word corpus.
nltk.download('punkt')
# Newer NLTK releases load the tokenizer from the 'punkt_tab' package instead
# of the pickled 'punkt' models; fetching both keeps word_tokenize working
# regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Increase recursion limit
import sys
sys.setrecursionlimit(10000)

# Read data. The CSV must provide at least the 'class' and 'tweet' columns
# used below.
# NOTE(review): the recorded output of this notebook ends in a pandas
# ParserError -- presumably raised by this call on a malformed or partially
# uploaded CSV; verify the file before re-running.
df = pd.read_csv("twitter_data.csv")

# Map the numeric class ids to human-readable labels. Ids missing from the
# mapping become NaN.
df['labels'] = df['class'].map({0: "Hate speech detected", 1: "Offensive language detected", 2: "No hate and offensive speech"})

# Keep only the text and its label. Take an explicit copy so that later
# column assignments modify an independent frame rather than a selection of
# the raw data.
df = df[['tweet', 'labels']].copy()
# Clean text
# Patterns applied by clean(), in order. They are compiled once at import
# time because clean() is called for every tweet.
_CLEAN_PATTERNS = (
    re.compile(r'\[.*?\]'),                              # [bracketed] spans
    re.compile(r'https?://\S+|www\.\S+'),                # URLs
    re.compile(r'<.*?>+'),                               # HTML-like tags
    re.compile('[%s]' % re.escape(string.punctuation)),  # ASCII punctuation
    re.compile(r'\n'),                                   # newlines
    re.compile(r'\w*\d\w*'),                             # tokens containing a digit
)


@functools.lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    return frozenset(stopwords.words('english'))


def clean(text, stop_words=None):
    """Normalise a piece of text for vectorisation.

    The input is converted to ``str`` and lower-cased; bracketed spans, URLs,
    HTML-like tags, ASCII punctuation, newlines and every token containing a
    digit are removed; finally stop words are filtered out.

    Args:
        text: Value to clean. Non-string values are passed through ``str()``.
        stop_words: Optional container of words to drop. When ``None`` (the
            default) NLTK's English stop-word list is used, which requires
            the ``stopwords`` corpus to have been downloaded.

    Returns:
        The cleaned text. Words are split and re-joined on single spaces, so
        runs of spaces left behind by removed tokens are preserved.
    """
    if stop_words is None:
        stop_words = _english_stopwords()

    text = str(text).lower()
    for pattern in _CLEAN_PATTERNS:
        text = pattern.sub('', text)

    return " ".join(word for word in text.split(' ') if word not in stop_words)

# Drop rows with a missing tweet or a missing label BEFORE cleaning:
# clean() converts its input with str(), so a NaN tweet would otherwise turn
# into the literal text "nan" and never be removed.
df = df.dropna(subset=["tweet", "labels"])

# Normalise the remaining tweets.
df["tweet"] = df["tweet"].apply(clean)

# No fillna(0) is needed here: both remaining columns were just filtered for
# missing values and clean() always returns a string.

# Word2Vec
def average_word_vectors(words, model, vocabulary, num_features):
    """Return the mean embedding of the in-vocabulary words of one document.

    Args:
        words: Iterable of tokens for a single document.
        model: Trained model exposing word vectors through ``model.wv[word]``.
        vocabulary: Container of tokens that have a vector in ``model``.
        num_features: Dimensionality of the word vectors.

    Returns:
        A float64 array of length ``num_features``. It is all zeros when no
        token of ``words`` is in ``vocabulary`` (e.g. an empty tweet).
    """
    total = np.zeros(num_features, dtype="float64")
    matched = 0
    for word in words:
        if word in vocabulary:
            total += model.wv[word]
            matched += 1
    if matched:
        total /= matched
    return total


tokenized_tweets = df['tweet'].apply(word_tokenize)
word2vec_model = Word2Vec(sentences=tokenized_tweets, vector_size=100, window=5, min_count=1, workers=4)

# index_to_key is a list; a set makes the per-token membership test O(1).
word2vec_vocabulary = set(word2vec_model.wv.index_to_key)
word2vec_features = tokenized_tweets.apply(
    lambda x: average_word_vectors(x, word2vec_model, word2vec_vocabulary, 100)
)
# Stack the per-tweet vectors into one (n_tweets, 100) feature matrix.
word2vec_features = np.vstack(word2vec_features.tolist())
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(word2vec_features, df["labels"], test_size=0.33, random_state=42)
w2v_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
w2v_classifier.fit(X_train_w2v, y_train_w2v)

# CountVectorizer: turn the cleaned tweets into a sparse bag-of-words matrix
# and hold out 33% of the rows for testing. `cv` is reused at prediction time
# to vectorise new text with the same vocabulary.
cv = CountVectorizer()
x = cv.fit_transform(df["tweet"])
X_train, X_test, y_train, y_test = train_test_split(x, df["labels"], test_size=0.33, random_state=42)

# Ensemble Model: soft-voting combination of a random forest, an SVM and a
# logistic regression, trained on the bag-of-words features.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# probability=True makes the SVM expose class probabilities, which soft
# voting averages across the estimators.
svm_classifier = SVC(probability=True)
logreg_classifier = LogisticRegression()
ensemble_model = VotingClassifier(estimators=[
    ('rf', rf_classifier),
    ('svm', svm_classifier),
    ('logreg', logreg_classifier)
], voting='soft')
ensemble_model.fit(X_train, y_train)

# Decision Tree: a shallow (depth 3) tree, small enough to be plotted later.
# NOTE(review): no random_state is set here (nor on the SVC above), unlike
# the random forests -- results may vary between runs.
clf = DecisionTreeClassifier(max_depth=3)
clf.fit(X_train, y_train)

# XGBoost: trained on integer-encoded labels; `label_encoder` is kept so
# predictions can be mapped back to the label strings.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
# Encoded test labels; not used anywhere else in the visible code.
y_test_encoded = label_encoder.transform(y_test)
xgb_classifier = XGBClassifier()
xgb_classifier.fit(X_train, y_train_encoded)

# Function to predict offensiveness
def predict_offensiveness_optimized(text):
    """Classify one piece of text with each of the bag-of-words models.

    The text is cleaned with ``clean`` and vectorised with the fitted
    ``cv`` before being passed to the voting ensemble, the decision tree and
    the XGBoost classifier.

    Args:
        text: Raw text to classify.

    Returns:
        A dict with the keys ``'ensemble_prediction'``, ``'tree_prediction'``
        and ``'xgb_prediction'``, each holding that model's predicted label.
    """
    # A single-document batch, encoded with the training vocabulary.
    features = cv.transform([clean(text)])

    voted_labels = ensemble_model.predict(features)
    tree_labels = clf.predict(features)

    # XGBoost predicts integer codes; map them back to the label strings.
    xgb_codes = xgb_classifier.predict(features)
    xgb_labels = label_encoder.inverse_transform(xgb_codes)

    return dict(
        ensemble_prediction=voted_labels[0],
        tree_prediction=tree_labels[0],
        xgb_prediction=xgb_labels[0],
    )

# Test examples: run two sample sentences through all three models.
text1 = "I will kill you"
text2 = "She is a good girl"

result1 = predict_offensiveness_optimized(text1)
result2 = predict_offensiveness_optimized(text2)

print(f"Text: {text1}\nPrediction: {result1}")
print(f"Text: {text2}\nPrediction: {result2}")

# Visualize Decision Tree
plt.figure(figsize=(20, 10))
plot_tree(
    clf,
    filled=True,
    # Plain lists are accepted by every scikit-learn version, whereas some
    # releases reject NumPy arrays for these parameters.
    feature_names=list(cv.get_feature_names_out()),
    # plot_tree expects class names in the same (sorted) order as
    # clf.classes_; df['labels'].unique() is in order of first appearance and
    # could attach the wrong label to a node.
    class_names=[str(label) for label in clf.classes_],
)
plt.show()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


ParserError: ignored