In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [None]:
import re
import nltk
from nltk.util import pr
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

# Fetch the NLTK corpora this notebook relies on: the stopword list plus the
# WordNet data (and its multilingual companion) used for lemmatization.
for _corpus in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_corpus)

# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

# English stopwords held in a set for fast membership tests.
stopword = set(stopwords.words("english"))

In [None]:
# Google Drive file ID of the CSV to load.
file_id = '1AXZ75jdfYlxBKTqkGh3qKsb4dJ-VMXJN'

# Direct-download URL for that file; pandas reads the CSV straight from it.
url = 'https://drive.google.com/uc?export=download&id={}'.format(file_id)
df = pd.read_csv(url)

# Preview the first rows.
df.head()

In [None]:
# Disabled alternative loader. Everything below is wrapped in a triple-quoted
# string literal, so none of it is executed. If re-enabled it would rebind
# `url` and `df` by reading the CSV export of a Google Sheets document
# instead of the Google Drive file.
'''
f_id = '1jbTLFrG-DfatEiXkscFQ_ZWupmsK6nlM5avhHFeEsIM'

# URL to access the CSV export of the Google Sheets
url = f'https://docs.google.com/spreadsheets/d/{f_id}/export?format=csv&id={f_id}'
df= pd.read_csv(url)
df.head()
'''

In [None]:
# Show every loaded column so the selection below can be checked against it.
print(df.columns)

# Keep only the comment text and the four target columns.
# .copy() makes the result an independent frame rather than a slice of the
# loaded one, so the column assignments made in later cells
# (df['text'] = ..., df['label'] = ...) do not raise SettingWithCopyWarning.
df = df[['text', 'Race', 'Religion', 'Sexuality', 'age']].copy()

In [None]:
# Tokenizer data required by nltk.word_tokenize, which clean() calls below.
import nltk
nltk.download('punkt')
# Recent NLTK releases load the Punkt tokenizer from the 'punkt_tab' resource
# instead of the pickled 'punkt' models; without it word_tokenize raises
# LookupError. On older releases this download is expected to be a harmless
# no-op that only reports the package as unknown.
nltk.download('punkt_tab')

def clean(text):
    """Normalise one raw comment into a space-joined string of lemmas.

    Steps, in order: lowercase; drop [bracketed] spans, URLs and <tags>;
    strip all ASCII punctuation; turn newlines into spaces and drop carriage
    returns; drop any token containing a digit; tokenize with NLTK; remove
    English stopwords; lemmatize the remaining tokens.

    Relies on the module-level `lemmatizer` and `stopword` objects created
    in an earlier cell.

    Args:
        text: The raw comment. Non-string values are converted with str().
            None and float NaN (pandas' missing-value marker) are treated as
            an empty document instead of becoming the literal word "nan".

    Returns:
        The cleaned text as a single string; "" for missing input.
    """
    # Missing value guard: NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) out of Python's own
    # string-escape processing, which warns about them on newer interpreters.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    words = nltk.word_tokenize(text)
    # The text is already lowercased, so tokens can be compared to the
    # stopword set directly.
    lemmas = [lemmatizer.lemmatize(word) for word in words if word not in stopword]
    return " ".join(lemmas)

# Clean every comment in the 'text' column. assign() builds a new frame
# instead of writing into the existing one, which avoids
# SettingWithCopyWarning when `df` is a column slice of the loaded data.
df = df.assign(text=df['text'].apply(clean))

# Bare last expression: the notebook renders the preview as a table.
df.head()

In [None]:
# Disabled earlier experiment. Everything below is wrapped in a triple-quoted
# string literal, so none of it is executed. It would vectorize the text with
# CountVectorizer and fit a RandomForestClassifier.
# NOTE(review): the 'target_*' column names used here are not among the
# columns kept in `df` by the selection cell above, and test_size=0.8 holds
# out 80% of the rows for testing -- both need revisiting before re-enabling.
'''
df['text'] = df['text'].apply(clean)

# Vectorize the text data
cv = CountVectorizer()
X = cv.fit_transform(df['text'])

# Now use the vectorized data (X) for train_test_split
y= np.array(df[['target_race','target_religion','target_origin','target_gender','target_sexuality']])
X_train,X_test,y_train,y_test= train_test_split(X, y,test_size=0.8,random_state=1)

rf = RandomForestClassifier()
rf.fit(X_train,y_train)
'''

In [None]:
#!pip install --upgrade scikit-learn==1.4.2
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
!pip show scikit-learn

In [None]:
# Data preprocessing: build the multi-label target matrix and the TF-IDF
# features from the DataFrame `df` prepared in the cells above.
label_cols = ['Race', 'Religion', 'Sexuality', 'age']

# Target matrix: one row per comment, one integer column per label.
y = df[label_cols].astype(int).to_numpy()
# Also keep the per-row label list on the frame as a 'label' column.
df['label'] = y.tolist()

# Features start out as the cleaned comment strings.
X = df['text']

# Split the raw text BEFORE fitting the vectorizer. Fitting TF-IDF on the
# whole corpus would let the vocabulary and IDF weights see the test rows
# (data leakage) and make the evaluation look better than it is.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the vocabulary / IDF weights from the training rows only, then apply
# the fitted vectorizer unchanged to the held-out rows.
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Full-corpus matrix kept under its original name for any cell that still
# expects it; it is produced by the train-fitted vectorizer.
X_vec = vectorizer.transform(X)

In [12]:
# Build the Random Forest model; it is fitted on the full label matrix, so a
# single forest predicts every label column at once.
# n_jobs=-1 builds and queries the trees in parallel on all available cores;
# the recorded run of this cell ended in KeyboardInterrupt, so the
# single-threaded fit was evidently too slow to wait for.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Predict the label rows for the held-out test data.
y_pred_rf = rf_model.predict(X_test)


KeyboardInterrupt: 

In [None]:
 # Evaluate the model
# Display names for the label columns, in the same order as the columns of y.
target_labels = ['Race', 'Religion', 'Sexuality', 'age']

# NOTE: with multi-label targets like these, scikit-learn's accuracy_score is
# exact-match accuracy -- a row counts as correct only if every label matches.
overall_accuracy = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", overall_accuracy)

# Per-label precision / recall / F1; zero_division=0 reports 0 instead of
# warning when a label has no predicted or true samples.
print("Classification Report:")
report_text = classification_report(
    y_test,
    y_pred_rf,
    target_names=target_labels,
    zero_division=0,
)
print(report_text)

In [None]:
def classify_text(model, vectorizer, text, labels=None):
    """Classify a single text and return one prediction per label.

    Args:
        model: Fitted estimator exposing ``predict``; called with the
            vectorized text and expected to return one row of predictions
            per input document.
        vectorizer: Fitted vectorizer exposing ``transform``; called with a
            one-element list containing ``text``.
        text: The document to classify.
        labels: Optional sequence of label names, in the same order as the
            model's output columns. Defaults to
            ``['Race', 'Religion', 'Sexuality', 'age']``.

    Returns:
        dict mapping each label name to the predicted value for ``text``.
        NumPy scalars are unwrapped to plain Python values so the result
        prints as ``{'Race': 1, ...}`` rather than ``np.int64(1)``.

    Raises:
        IndexError: If the model returns fewer values than there are labels.
    """
    if labels is None:
        labels = ['Race', 'Religion', 'Sexuality', 'age']

    # The vectorizer works on a collection of documents, hence the list.
    text_vec = vectorizer.transform([text])
    # One input document -> the first (only) row of the prediction output.
    row = model.predict(text_vec)[0]

    result = {}
    for i, label in enumerate(labels):
        value = row[i]
        # .item() turns a NumPy scalar into its built-in Python equivalent;
        # values without it (already plain Python) are passed through.
        result[label] = value.item() if hasattr(value, 'item') else value
    return result

# Interactive prompt: classify texts typed by the user until they enter 'exit'.
while True:
    try:
        new_text = input("Enter text to classify (or 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: stop quietly
        # instead of leaving a traceback in the notebook.
        break

    # Ignore surrounding whitespace so ' exit ' also quits.
    if new_text.strip().lower() == 'exit':
        break
    # Nothing to classify on a blank line; ask again.
    if not new_text.strip():
        continue

    # The model was trained on text passed through clean(), so new input gets
    # the same normalisation before it reaches the vectorizer.
    classification_result = classify_text(rf_model, vectorizer, clean(new_text))
    print("Classification Result:", classification_result)

In [None]:
from google.colab import drive
import pickle

# Make Google Drive available under /content/drive (Colab only).
drive.mount('/content/drive')

# Pickle targets, written in this order: destination path -> object to store.
# NOTE(review): the second file stores a (model, vectorizer) tuple, not the
# vectorizer alone, despite its name -- code that loads it must unpack both.
artifacts = {
    '/content/drive/My Drive/Nlp Project/Random Forest Hate Spech Classification.pkl': rf_model,
    '/content/drive/My Drive/Nlp Project/RandomForest_vectorizer.pkl': (rf_model, vectorizer),
}

for artifact_path, payload in artifacts.items():
    with open(artifact_path, 'wb') as handle:
        pickle.dump(payload, handle)