In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Ensure the required NLTK data is downloaded
import nltk
# 'punkt' backs word_tokenize on older NLTK releases; recent releases load the
# tokenizer model from 'punkt_tab' instead, so both are fetched to keep the
# tokenization step working on either. 'stopwords' and 'wordnet' are needed
# for stop-word removal and lemmatization.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)


# Load data from CSV file
file_path = 'Twitter_Data.csv'  # Replace with the actual path to your CSV file
df = pd.read_csv(file_path)

# 1. Sentiment Analysis

# Normalise one raw tweet into lowercase, letters-and-spaces-only text
def clean_text(text):
    """Return a normalised copy of ``text``, or ``""`` for non-string input.

    The string is lowercased, then the following are removed in order:
    URLs starting with ``http``, ``@mentions`` (the whole handle) and the
    ``#`` character (the hashtag word itself is kept), and every character
    that is not ``a-z`` or whitespace. Finally, whitespace runs collapse to a
    single space and the ends are trimmed.
    """
    if not isinstance(text, str):
        return ""  # e.g. NaN cells coming out of the CSV

    # Applied top to bottom; URLs and mentions must go before the
    # letters-only filter strips the characters that identify them.
    substitutions = (
        (r'http\S+', ''),
        (r'\@\w+|\#', ''),
        (r'[^a-z\s]', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Run every entry of the text column through clean_text (element-wise)
df['clean_text'] = df['clean_text'].map(clean_text)

# Turn TextBlob's polarity score into a three-way label
def get_sentiment(text):
    """Return 1, -1 or 0 according to the sign of TextBlob's polarity for ``text``.

    A strictly positive polarity gives 1, a strictly negative one gives -1,
    and anything else gives 0.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 1
    return -1 if polarity < 0 else 0

# Label every cleaned tweet with its TextBlob-derived sentiment
df['predicted_category'] = df['clean_text'].map(get_sentiment)

# Preview the first five rows, including the new column
print(df.head(5))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ANKIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANKIT\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ANKIT\AppData\Roaming\nltk_data...


                                          clean_text  category  \
0  when modi promised minimum government maximum ...      -1.0   
1  talk all the nonsense and continue all the dra...       0.0   
2  what did just say vote for modi welcome bjp to...       1.0   
3  asking his supporters prefix chowkidar their n...       1.0   
4  answer who among these the most powerful world...       1.0   

   predicted_category  
0                  -1  
1                   0  
2                   1  
3                   1  
4                   1  


In [11]:
# Function to clean and preprocess text
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Clean ``text``, drop English stop words and lemmatize what remains.

    Args:
        text: The raw entry. Anything that is not a ``str`` is treated as
            missing.

    Returns:
        The surviving lemmatized tokens joined by single spaces, or ``""``
        when ``text`` is not a string or nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ""  # Return empty string for non-string entries

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and the '#' of hashtags
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation and numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    if not text:
        return ""  # Nothing to tokenize; same result as joining zero tokens

    # The stop-word list and lemmatizer do not depend on `text`, so they are
    # built once and reused instead of being recreated for every row.
    stop_words, lemmatizer = _preprocess_resources()
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    )

# Replace each tweet with its stop-word-free, lemmatized form
df['clean_text'] = df['clean_text'].map(preprocess_text)

# Preview the first five rows after preprocessing
print(df.head(5))

# TF-IDF features, capped at the 1000 highest-frequency terms
MAX_TFIDF_FEATURES = 1000
vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
X = vectorizer.fit_transform(df['clean_text'])

print("x-shape - ", X.shape)


                                          clean_text  category  \
0  modi promised minimum government maximum gover...      -1.0   
1             talk nonsense continue drama vote modi       0.0   
2  say vote modi welcome bjp told rahul main camp...       1.0   
3  asking supporter prefix chowkidar name modi gr...       1.0   
4  answer among powerful world leader today trump...       1.0   

   predicted_category  
0                  -1  
1                   0  
2                   1  
3                   1  
4                   1  
x-shape -  (162980, 1000)


In [12]:
# Assuming df['category'] contains the sentiment labels (-1, 0, 1)

# The labels load as floats (-1.0 / 0.0 / 1.0), which suggests the column
# contains missing values -- TODO confirm with df['category'].isna().sum().
# scikit-learn estimators reject NaN targets, so only labelled rows are kept.
# The same positional mask is applied to X so features and labels stay aligned.
has_label = df['category'].notna().to_numpy()
X_labeled = X[has_label]
y = df.loc[has_label, 'category'].astype(int)

# NOTE(review): X comes from a TF-IDF vectorizer fitted on the whole corpus,
# so test documents influenced the vocabulary/IDF weights. For a stricter
# evaluation, split the text first and fit the vectorizer on the training part.

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X_labeled, y, test_size=0.2, random_state=42
)

# Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

NameError: name 'train_test_split' is not defined