In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [2]:
# Read the first 50000 rows of the CSV file into a DataFrame.
data = pd.read_csv(
    'FinalBalancedDataset.csv',
    nrows=50000,
)

In [3]:
# Remove the "Unnamed: 0" column from the frame.
# errors="ignore" keeps this cell idempotent: because `data` is rebound here,
# re-running the cell would otherwise raise KeyError (column already dropped).
data = data.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Preview the first five rows (head() defaults to n=5).
data.head()

Unnamed: 0,Toxicity,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [5]:
# Features are the 'tweet' column; the target is the 'Toxicity' column.
X, y = data['tweet'], data['Toxicity']

In [6]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# TF-IDF features with the vocabulary capped at 5000 terms
# (max_features is a tunable knob).
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
)

# Fit on the training split only, then reuse the fitted vectorizer for the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [8]:
# Base estimators that are combined by the VotingClassifier defined below.
lr_model = LogisticRegression()
nb_model = MultinomialNB()
# probability=True makes the SVM expose predict_proba, which soft voting needs.
# Its probability calibration is randomized, so seed it for reproducible metrics.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
# Random forests are stochastic; use the same seed as the train/test split.
rf_model = RandomForestClassifier(random_state=42)

In [9]:
# Soft-voting ensemble over the four base estimators.
voting_classifier = VotingClassifier(
    estimators=[('lr', lr_model), ('nb', nb_model), ('svm', svm_model), ('rf', rf_model)],
    voting='soft',
)

In [10]:
# Train the ensemble on the TF-IDF training matrix and its labels.
voting_classifier.fit(X=X_train_tfidf, y=y_train)

In [11]:
# Predict labels for the held-out TF-IDF test matrix.
y_pred = voting_classifier.predict(X=X_test_tfidf)

In [12]:
# Score the held-out predictions against the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Report each metric on its own line, in the same order as computed above.
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

Accuracy: 0.9464
Precision: 0.952962191128312
Recall: 0.8943839061190276
F1 Score: 0.922744306716633


In [15]:
# Import here so this cell works under Restart & Run All: in notebook order it
# runs before the cell that imports joblib, which would raise NameError.
import joblib

# Persist the fitted TF-IDF vectorizer so inference can reuse the same vocabulary.
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')


['tfidf_vectorizer.pkl']

In [13]:
import joblib

In [14]:
# Persist the trained ensemble to disk.
joblib.dump(
    value=voting_classifier,
    filename='voting_classifier_model.pkl',
)

['voting_classifier_model.pkl']

In [None]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the saved model.
# NOTE: joblib.load unpickles the file and can execute arbitrary code;
# only load artifacts produced by this notebook or another trusted source.
loaded_model = joblib.load('voting_classifier_model.pkl')

# Load the TF-IDF vectorizer. It was saved after fitting, so it must NOT be
# refitted here: refitting would discard the persisted state and would need
# X_train, which does not exist in a fresh inference session.
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Take input from the user
user_input = input("Enter your tweet: ")

# Vectorize the input with the already-fitted vectorizer
# (transform expects an iterable of documents, hence the one-element list).
user_input_tfidf = tfidf_vectorizer.transform([user_input])

# Use the loaded model to make predictions
prediction = loaded_model.predict(user_input_tfidf)

# Output the prediction result.
# predict returns an array with one label per input document; take the single
# element explicitly instead of relying on one-element array truthiness.
if prediction[0] == 1:
    print("The comment is toxic.")
else:
    print("The comment is not toxic.")
