# In [None]:
import pandas as pd
# Location of the tweet dataset on disk (replace with your file path)
DATASET_PATH = "twitter_sentiment.csv"
# Read the CSV into a DataFrame
df = pd.read_csv(DATASET_PATH)
# Peek at the first few rows
df.head()

import tweepy

# Twitter API credentials (placeholders -- replace with your own)
consumer_key = 'your_consumer_key'
consumer_secret = 'your_consumer_secret'
access_token = 'your_access_token'
access_token_secret = 'your_access_token_secret'

# Build an authenticated API client from the four credentials
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

# Search for English tweets tagged #AI (search_tweets() is the Tweepy v4.0+ name)
tweets = api.search_tweets(q="#AI", count=100, lang="en")

# One record per tweet: its text and its creation timestamp
data = [
    dict(tweet=status.text, created_at=status.created_at)
    for status in tweets
]
# NOTE(review): this rebinds `df`, replacing whatever DataFrame was loaded
# before it. The frame built here only has 'tweet' and 'created_at' columns,
# so code that expects a 'sentiment' column on `df` will not find one --
# confirm which data source is meant to feed the rest of the pipeline.
df = pd.DataFrame(data)
# Peek at the collected tweets
df.head()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
# Download necessary NLTK resources (if not already done).
# Recent NLTK releases (3.9+) load the 'punkt_tab' tokenizer data inside
# word_tokenize, while older releases load the legacy 'punkt' package, so
# both are requested to keep the tokenizer usable across versions.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)
# Text preprocessing helpers.
# The patterns are compiled once here instead of being looked up on every call.
_URL_RE = re.compile(r"http\S+|www\S+")   # "http\S+" already covers https links
_NON_ALPHA_RE = re.compile(r"[^a-z\s]")   # anything but lowercase letters/whitespace
_STOP_WORDS_CACHE = None


def _get_stop_words():
    """Return the English stop-word set, reading the NLTK corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def preprocess_text(text):
    """Normalise a raw tweet into a space-separated string of content words.

    Steps: lowercase, strip URLs, drop every character that is not a
    lowercase letter or whitespace, tokenize with NLTK's ``word_tokenize``,
    and remove English stop words.

    Args:
        text: The raw tweet text. Non-string values (e.g. ``None`` or the
            ``NaN`` pandas uses for missing cells) are treated as empty.

    Returns:
        The cleaned tokens joined by single spaces; ``''`` when nothing
        remains or the input is not a string.
    """
    # Missing cells arrive as float NaN / None and have no .lower()
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = _URL_RE.sub('', text)
    text = _NON_ALPHA_RE.sub('', text)

    # Nothing left to tokenize
    if not text.strip():
        return ''

    stop_words = _get_stop_words()
    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_words
    )

# Run every tweet through preprocess_text and keep the result in a new column
# (the raw text lives in the 'tweet' column, not 'text')
df['cleaned_text'] = df['tweet'].map(preprocess_text)
# Show raw and cleaned text side by side
df.loc[:, ['tweet', 'cleaned_text']].head()

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Fail fast: check for the label column before doing any vectorization work
if 'sentiment' not in df.columns:
    raise ValueError("Dataset must contain a 'sentiment' column")

# Labels (sentiment)
y = df['sentiment']

# Split the cleaned *text* first (80% training, 20% testing) so that the
# TF-IDF vocabulary and IDF weights are learned from training rows only.
# Fitting the vectorizer on the full dataset would leak test-set statistics
# into the features the model is trained on.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42
)

# Initialize the TF-IDF vectorizer, fit it on the training text only, and
# reuse the fitted vocabulary to transform the test text
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train = tfidf_vectorizer.fit_transform(text_train)
X_test = tfidf_vectorizer.transform(text_test)

# Full-dataset matrix under its original name, built with the train-fitted
# vectorizer, for any later code that still refers to X
X = tfidf_vectorizer.transform(df['cleaned_text'])
# Check the shape of the resulting matrix
print(X.shape)  # (number of samples, number of features)

# Check the size of the split data
print(f"Training data size: {X_train.shape[0]}")
print(f"Testing data size: {X_test.shape[0]}")

from sklearn.neighbors import KNeighborsClassifier

# Create a 5-nearest-neighbours classifier and fit it on the training split
# in a single expression (fit() hands back the fitted estimator)
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict a sentiment label for every row of the test split
y_pred = knn.predict(X_test)

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Display confusion matrix.
# The class labels are taken from the data (every label that occurs in the
# true or predicted values, in sorted order) and passed explicitly to
# confusion_matrix, so the matrix rows/columns and the tick labels always
# agree -- even when a class is missing from the test split or the dataset
# uses label names other than negative/neutral/positive.
class_labels = sorted(set(y_test) | set(y_pred))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(6,6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

from wordcloud import WordCloud

# Concatenate the cleaned text of all tweets in each sentiment class
positive_tweets = ' '.join(df[df['sentiment'] == 'positive']['cleaned_text'])
negative_tweets = ' '.join(df[df['sentiment'] == 'negative']['cleaned_text'])
neutral_tweets = ' '.join(df[df['sentiment'] == 'neutral']['cleaned_text'])


def _build_wordcloud(text):
    """Return an 800x400 WordCloud for *text*, or None if it has no words."""
    try:
        return WordCloud(width=800, height=400).generate(text)
    except ValueError:
        # WordCloud.generate raises ValueError when it finds no words to
        # plot (e.g. a sentiment class with no tweets); skip that panel
        # instead of aborting the whole figure.
        return None


# Create word clouds (None for a class that has nothing to draw)
positive_wc = _build_wordcloud(positive_tweets)
negative_wc = _build_wordcloud(negative_tweets)
neutral_wc = _build_wordcloud(neutral_tweets)

# Plot the word clouds side by side in a 1x3 grid
panels = [
    ("Positive Sentiment", positive_wc),
    ("Negative Sentiment", negative_wc),
    ("Neutral Sentiment", neutral_wc),
]
plt.figure(figsize=(12, 8))
for position, (title, cloud) in enumerate(panels, start=1):
    plt.subplot(1, 3, position)
    if cloud is not None:
        plt.imshow(cloud, interpolation='bilinear')
        plt.title(title)
    else:
        # Make the empty panel explicit rather than leaving it blank
        plt.title(f"{title}\n(no words to display)")
    plt.axis('off')
plt.show()

