In [12]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import pickle

# Download the stopwords corpus from nltk
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    
    # Remove words next to '@' (usernames)
    text = re.sub(r'@\w+', '', text)
    
    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenize and remove stopwords
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    # Join the words back into a single string
    return ' '.join(words)

# Load the dataset from a CSV file
df = pd.read_csv('twitter_parsed_dataset.csv')

# Replace NaN values in the 'Text' column with an empty string
df['Text'].fillna('', inplace=True)
df.dropna(inplace=True)
# Replace 'none' annotation labels with 'neutral'
df['Annotation'] = df['Annotation'].replace('none', 'neutral')
print(df['Annotation'].unique)
# Apply the preprocessing function to the 'Text' column
df['Processed_Text'] = df['Text'].apply(preprocess_text)

# Vectorize the text data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Processed_Text'])

# Encode the target variable (Annotation)
y = df['Annotation']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Save the trained model and the vectorizer
with open('naive_bayes_model.pkl', 'wb') as model_file:
    pickle.dump(nb_model, model_file)

with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

# Load the saved model and vectorizer
with open('naive_bayes_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('vectorizer.pkl', 'rb') as vec_file:
    loaded_vectorizer = pickle.load(vec_file)

# Input your own comment for prediction
user_input = input("Enter a comment: ")

# Preprocess the user input
user_input_processed = loaded_vectorizer.transform([preprocess_text(user_input)])

# Predict the annotation label for the input comment
prediction = loaded_model.predict(user_input_processed)

# Display the predicted annotation label
print(f"The annotation label for the given comment is: {prediction[0]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vaisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<bound method Series.unique of 0        neutral
1        neutral
2         sexism
3         racism
4        neutral
          ...   
16846    neutral
16847    neutral
16848    neutral
16849    neutral
16850    neutral
Name: Annotation, Length: 16848, dtype: object>
The annotation label for the given comment is: neutral


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vaisa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ValueError: All arrays must be of the same length

In [13]:

# Load the saved model and vectorizer
with open('naive_bayes_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

with open('vectorizer.pkl', 'rb') as vec_file:
    loaded_vectorizer = pickle.load(vec_file)

# Input your own comment for prediction
user_input = input("Enter a comment: ")

# Preprocess the user input
user_input_processed = loaded_vectorizer.transform([preprocess_text(user_input)])

# Predict the annotation label for the input comment
prediction = loaded_model.predict(user_input_processed)

# Display the predicted annotation label
print(f"The annotation label for the given comment is: {prediction[0]}")


The annotation label for the given comment is: neutral
