In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load DataFrame
df = pd.read_csv("C:/Users/durga/Desktop/SPU/DS 600/PROJECT/FINAL/cleaned_data.csv")

# Tokenization and Preprocessing function
def tokenize_and_preprocess(text):
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords and punctuation
    tokens = [word.lower() for word in tokens if word.lower() not in stopwords.words('english') and word.lower() not in string.punctuation]
    # Lemmatize and Stem
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens]
    return tokens

# Apply tokenization and preprocessing to the 'Comment' column
df['Tokenized_Comment'] = df['Comment'].apply(tokenize_and_preprocess)

# Display the preprocessed DataFrame
print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\durga\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\durga\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\durga\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                  Name    Profile ID               Date  \
0         scotty2jatty  3.982741e+08  22/04/24 05:27:52   
1     angelasanders975  6.596329e+10  22/04/24 17:43:49   
2        myleslewis_24  1.168729e+10  22/04/24 17:49:54   
3  matthew_williams224  2.524021e+10  22/04/24 18:53:42   
4      vremyatherapper  3.178600e+09  22/04/24 19:21:58   

                                Comment  Comment_Length  \
0                 better jakepaul dirty            34.0   
1               thats soooooooooo right            24.0   
2                        bro want shirt            21.0   
3  michaelgalt bro needs beat jakes ass            60.0   
4                              mike tko            28.0   

                          Tokenized_Comment  
0                 [better, jakepaul, dirti]  
1                [that, soooooooooo, right]  
2                        [bro, want, shirt]  
3  [michaelgalt, bro, need, beat, jake, as]  
4                               [mike, tko]  


In [4]:
hate = pd.read_csv("C:/Users/durga/Desktop/SPU/DS 600/PROJECT/FINAL/DATA FILES/HATE.csv",encoding='latin1')
hate['word'] = hate['word'].str.lower()

In [9]:
def assign_sentiment(tokenized_comment):
    sentiments = []
    for token in tokenized_comment:
        # Check if token exists in hate dataset
        if token in hate['word'].values:
            sentiment = hate.loc[hate['word'] == token, 'sentiment'].iloc[0]
            sentiments.append(sentiment)
    # If no sentiment found, assume neutral
    if not sentiments:
        sentiments.append('Neutral')
    return sentiments

# Apply tokenization and preprocessing to the 'Comment' column
df['Tokenized_Comment'] = df['Comment'].apply(tokenize_and_preprocess)

# Assign sentiments to tokenized comments
df['sentiment'] = df['Tokenized_Comment'].apply(assign_sentiment)

# Display the preprocessed DataFrame
print(df.head())

                  Name    Profile ID               Date  \
0         scotty2jatty  3.982741e+08  22/04/24 05:27:52   
1     angelasanders975  6.596329e+10  22/04/24 17:43:49   
2        myleslewis_24  1.168729e+10  22/04/24 17:49:54   
3  matthew_williams224  2.524021e+10  22/04/24 18:53:42   
4      vremyatherapper  3.178600e+09  22/04/24 19:21:58   

                                Comment  Comment_Length  \
0                 better jakepaul dirty            34.0   
1               thats soooooooooo right            24.0   
2                        bro want shirt            21.0   
3  michaelgalt bro needs beat jakes ass            60.0   
4                              mike tko            28.0   

                          Tokenized_Comment   sentiment  
0                 [better, jakepaul, dirti]   [Neutral]  
1                [that, soooooooooo, right]   [Neutral]  
2                        [bro, want, shirt]   [Neutral]  
3  [michaelgalt, bro, need, beat, jake, as]  [negative]  


In [10]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Function to join tokenized words into strings
def join_tokens(tokens):
    return ' '.join(tokens)

# Joining tokenized words into strings
df['Tokenized_Comment_String'] = df['Tokenized_Comment'].apply(join_tokens)

# Creating CountVectorizer to convert tokenized comments into a bag of words representation
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Tokenized_Comment_String'])

# Training LDA model
lda_model = LatentDirichletAllocation(n_components=2, random_state=42)  
lda_model.fit(X)

# Getting the topic-word matrix
topic_word_matrix = lda_model.components_

# Get the feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Define functions to get top words for each topic
def get_top_words(topic_index, n_top_words):
    return [feature_names[i] for i in topic_word_matrix[topic_index].argsort()[:-n_top_words - 1:-1]]

# Get top words for each topic
n_top_words = 10  
top_words_offensive = get_top_words(0, n_top_words)
top_words_non_offensive = get_top_words(1, n_top_words)

print("Top words for offensive topic:")
print(top_words_offensive)
print("\nTop words for non-offensive topic:")
print(top_words_non_offensive)

Top words for offensive topic:
['like', 'one', 'lol', 'get', 'even', 'presid', 'need', 'biden', 'come', 'know']

Top words for non-offensive topic:
['trump', 'biden', 'woman', 'like', 'vote', 'go', 'get', 'presid', 'peopl', 'one']


In [13]:
def preprocess_text(text):
    # Join tokens into a single string
    text = ' '.join(text)
    # Converting to lowercase
    text = text.lower()
    # Removing punctuation and non-alphanumeric characters
    text = ''.join([char for char in text if char.isalnum() or char.isspace()])
    return text

In [15]:
def predict_sentiment(inferred_topics, model):
    # Example: If the sum of topic weights for offensive topics is greater than non-offensive, predict offensive sentiment
    offensive_topics_sum = inferred_topics[:, 0].sum()
    non_offensive_topics_sum = inferred_topics[:, 1].sum()

    if offensive_topics_sum > non_offensive_topics_sum:
        return 'Offensive'
    else:
        return 'Non-offensive'

In [17]:
from sklearn.ensemble import RandomForestClassifier

# Define and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

new_comment = "Your new comment dumb ass here"
preprocessed_new_comment = preprocess_text(new_comment)

# Convert the preprocessed comment into a bag of words representation
new_comment_vectorized = vectorizer.transform([preprocessed_new_comment])

# Use the trained LDA model to infer topics
new_comment_topics = lda_model.transform(new_comment_vectorized)

# Assuming you have a function to predict sentiment using the inferred topics and the trained model
predicted_sentiment = predict_sentiment(new_comment_topics, model)

print("Predicted sentiment for the new comment:", predicted_sentiment)

NameError: name 'X_train' is not defined