<a href="https://colab.research.google.com/github/Divyank7436/twitterSentimentAnalysis/blob/main/twitterSentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Download stopwords
nltk.download('stopwords')

# Step 2: Load the dataset
# Replace 'your_dataset.csv' with the path to your dataset
df = pd.read_csv('twitter_training.csv', encoding='ISO-8859-1')

# Display first few rows
print("Dataset sample:")
print(df.head())





Dataset sample:
   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  
0  I am coming to the borders and I will kill you...     
1  im getting on borderlands and i will kill you ...     
2  im coming on borderlands and i will murder you...     
3  im getting on borderlands 2 and i will murder ...     
4  im getting into borderlands and i can murder y...     


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
print("Column names in the dataset:", df.columns)


Column names in the dataset: Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')


In [6]:
print(df.head())
print("Column names in the dataset:", df.columns)


   2401  Borderlands  Positive  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

  im getting on borderlands and i will murder you all ,  \
0  I am coming to the borders and I will kill you...      
1  im getting on borderlands and i will kill you ...      
2  im coming on borderlands and i will murder you...      
3  im getting on borderlands 2 and i will murder ...      
4  im getting into borderlands and i can murder y...      

                      clean_text  
0            coming borders kill  
1    im getting borderlands kill  
2   im coming borderlands murder  
3  im getting borderlands murder  
4  im getting borderlands murder  
Column names in the dataset: Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,', 'clean_text'],
      dtype='object')


In [7]:


# Step 3: Data Preprocessing
# Assuming the text data is in the last column and sentiment in 'Positive' column
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    text = text.split()  # Tokenize
    text = [word for word in text if word not in stopwords.words('english')]  # Remove stopwords
    return ' '.join(text)

# Apply preprocessing to the dataset
# Convert the column to strings and handle missing values
df['im getting on borderlands and i will murder you all ,'] = df['im getting on borderlands and i will murder you all ,'].astype(str).fillna('')

# Apply preprocessing to the dataset
df['clean_text'] = df['im getting on borderlands and i will murder you all ,'].apply(preprocess_text)


In [8]:



# Step 4: Split the data into training and test sets
X = df['clean_text']
y = df['Positive']  # Adjust based on the column containing sentiment labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [9]:


# Step 5: Feature extraction using CountVectorizer and TF-IDF
count_vect = CountVectorizer(max_features=5000)
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)



In [10]:

# Step 6: Train the model (Naive Bayes)
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)



In [11]:


# Transform the test data
X_test_counts = count_vect.transform(X_test)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Step 7: Make predictions and evaluate the model
y_pred = model.predict(X_test_tfidf)

print("\nModel Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))




Model Accuracy: 0.6573064045793282

Classification Report:
              precision    recall  f1-score   support

  Irrelevant       0.78      0.41      0.54      1794
    Negative       0.69      0.74      0.71      2535
     Neutral       0.68      0.57      0.62      2199
    Positive       0.59      0.82      0.69      2731

    accuracy                           0.66      9259
   macro avg       0.69      0.63      0.64      9259
weighted avg       0.68      0.66      0.65      9259



In [12]:

# Step 8: Test with new data (optional)
new_tweet = "I love this new feature on Twitter!"
processed_tweet = preprocess_text(new_tweet)
tweet_tfidf = tfidf_transformer.transform(count_vect.transform([processed_tweet]))
print("\nPredicted Sentiment for new tweet:", model.predict(tweet_tfidf)[0])


Predicted Sentiment for new tweet: Positive


In [13]:


# Step 9: Test with another new data (optional)
new_tweet = "the world is so fucked up!"
processed_tweet = preprocess_text(new_tweet)
tweet_tfidf = tfidf_transformer.transform(count_vect.transform([processed_tweet]))
print("\nPredicted Sentiment for new tweet:", model.predict(tweet_tfidf)[0])


Predicted Sentiment for new tweet: Negative
