<a href="https://colab.research.google.com/github/Deepureddi/project/blob/main/mini_project(twitter_analysis).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from textblob import TextBlob

# Load the dataset. The file has no header row, so header=None makes pandas
# label the columns with integers instead of consuming the first record.
data = pd.read_csv('/content/sample_data/twitter_training.csv', header=None)

# Print column labels to verify the layout before selecting by label
print("Columns in your dataset:", data.columns)

# Keep only the columns labelled 2 and 3 and give them readable names:
# column 2 holds the sentiment category, column 3 holds the tweet text.
data = data[[2, 3]]
data.columns = ['category', 'clean_text']

# Drop any rows with missing tweet text. The explicit .copy() makes the result
# an independent DataFrame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
data = data.dropna(subset=['clean_text']).copy()

# Define function to get sentiment using TextBlob
def analyze_sentiment(tweet):
    """Classify a tweet by the sign of its TextBlob polarity score.

    The input is converted with ``str()`` before being analysed.

    Returns:
        1.0 when the polarity is greater than zero (positive),
        0.0 when it is exactly zero (neutral),
        -1.0 otherwise (negative).
    """
    score = TextBlob(str(tweet)).sentiment.polarity
    if score == 0:
        return 0.0  # Neutral
    # Non-zero score: the sign decides between positive and negative.
    return 1.0 if score > 0 else -1.0

# Apply the sentiment analysis to tweets. assign() returns a new DataFrame with
# the column added, instead of writing into a frame that may itself be a
# filtered slice of another one (which would raise SettingWithCopyWarning).
data = data.assign(predicted_sentiment=data['clean_text'].apply(analyze_sentiment))

# Map actual sentiment categories to numerical values.
# 'Irrelevant' is mapped to 2.0, a value analyze_sentiment never returns, so
# those rows can never be marked correct; they are excluded from the accuracy
# calculation below.
sentiment_map = {
    'Positive': 1.0,
    'Neutral': 0.0,
    'Negative': -1.0,
    'Irrelevant': 2.0,
}

# Keep only rows with a known sentiment category. .copy() detaches the
# filtered result so the column assignments that follow operate on an
# independent frame.
data = data[data['category'].isin(list(sentiment_map))].copy()

# Map actual categories to numerical
data['actual_sentiment'] = data['category'].map(sentiment_map)

# A prediction is correct when the predicted value equals the mapped label
data['correct'] = data['predicted_sentiment'] == data['actual_sentiment']

# Calculate accuracy over Positive/Neutral/Negative rows only
relevant_data = data[data['category'].isin(['Positive', 'Neutral', 'Negative'])]
accuracy = relevant_data['correct'].mean()


print(f"\nAccuracy of sentiment analysis (excluding 'Irrelevant'): {accuracy * 100:.2f}%")

# Display a few results
print("\nSample predictions:")
print(relevant_data[['clean_text', 'category', 'predicted_sentiment', 'correct']].head())

Columns in your dataset: Index([0, 1, 2, 3], dtype='int64')

Accuracy of sentiment analysis (excluding 'Irrelevant'): 48.29%

Sample predictions:
                                          clean_text  category  \
0  im getting on borderlands and i will murder yo...  Positive   
1  I am coming to the borders and I will kill you...  Positive   
2  im getting on borderlands and i will kill you ...  Positive   
3  im coming on borderlands and i will murder you...  Positive   
4  im getting on borderlands 2 and i will murder ...  Positive   

   predicted_sentiment  correct  
0                  0.0    False  
1                  0.0    False  
2                  0.0    False  
3                  0.0    False  
4                  0.0    False  
