# Load Necessary Libraries

In [None]:
import os
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
# Fetch the NLTK resources this notebook relies on. The value of the last
# call is what the cell displays as its result.
# 'stopwords' backs the stopwords.words('english') lookup used during cleaning.
nltk.download('stopwords')
# NOTE(review): 'punkt' / 'punkt_tab' are presumably the tokenizer data needed
# by nltk.word_tokenize (which one is required depends on the NLTK version) — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Load the Dataset

In [None]:
# Read the tweets CSV into a DataFrame and preview its first rows.
tweets_csv = "/content/tweets.csv"
data = pd.read_csv(tweets_csv)
data.head()

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...
3,4,0,I'm wired I know I'm George I was made that wa...
4,5,1,What amazing service! Apple won't even talk to...


# Data Preprocessing

In [None]:
# Count the missing values in each column and report them.
missing_per_column = data.isnull().sum()
print(missing_per_column)


id       0
label    0
tweet    0
dtype: int64


In [None]:
# Clean the text data (remove URLs, punctuation, numbers and stopwords; convert to lowercase).
# Vectorization (TF-IDF) happens in a later step on the cleaned text.

# Build the stopword lookup once. The previous version evaluated
# stopwords.words('english') for every single token and searched the resulting
# list linearly, which dominates the runtime when applied to a whole column.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Normalise one raw tweet into a space-separated string of tokens.

    Steps, in order: strip anything starting with 'http' up to the next
    whitespace, replace every non-letter character with a space, lowercase,
    tokenize with nltk.word_tokenize, and drop English stopwords.

    Args:
        text: The raw tweet as a string.

    Returns:
        The remaining lowercase tokens joined by single spaces (an empty
        string when nothing survives the cleaning).
    """
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Convert to lowercase (must precede stopword removal: the lookup is lowercase)
    text = text.lower()
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stopwords using the precomputed set (constant-time membership test)
    return ' '.join(word for word in words if word not in ENGLISH_STOPWORDS)

# Run the cleaning function over every tweet and keep the result in a new column.
raw_tweets = data['tweet']
data['cleaned_text'] = raw_tweets.apply(preprocess_text)
print("\nText preprocessing completed!")


Text preprocessing completed!


In [None]:
# Show the raw tweets next to their cleaned versions as a quick sanity check
print("\nSample of cleaned text:")
preview_columns = ['tweet', 'cleaned_text']
print(data[preview_columns].head())


Sample of cleaned text:
                                               tweet  \
0  #fingerprint #Pregnancy Test https://goo.gl/h1...   
1  Finally a transparant silicon case ^^ Thanks t...   
2  We love this! Would you go? #talk #makememorie...   
3  I'm wired I know I'm George I was made that wa...   
4  What amazing service! Apple won't even talk to...   

                                        cleaned_text  
0  fingerprint pregnancy test android apps beauti...  
1  finally transparant silicon case thanks uncle ...  
2  love would go talk makememories unplug relax i...  
3  wired know george made way iphone cute daventr...  
4  amazing service apple even talk question unles...  


# Exploratory Data Analysis (EDA)

In [None]:
# Check for missing values in the dataset
print("\nMissing values in the dataset:")
print(data.isnull().sum())

# The target of this dataset is the 'label' column (columns are id, label,
# tweet); there is no 'sentiment' column, so checking for that name always
# fell through to the "missing" branch and skipped the analysis.
if 'label' in data.columns:
    if data['label'].isnull().any():
        # Drop unlabeled rows instead of filling with a string placeholder:
        # mixing 'unknown' into a numeric target would create a bogus class.
        print("\nThe 'label' column contains missing values. Dropping those rows.")
        data = data.dropna(subset=['label']).reset_index(drop=True)

    # Check the unique values in the 'label' column
    print("\nUnique values in the 'label' column:")
    print(data['label'].unique())

    # Check the distribution of labels. The column is passed by keyword
    # (x=..., data=...) rather than positionally.
    sns.countplot(x='label', data=data, palette='viridis')
    plt.title('Distribution of Sentiments')
    plt.xlabel('Sentiment')
    plt.ylabel('Count')
    plt.show()
else:
    print("\nThe 'label' column is missing. Ensure the dataset has the required column.")



Missing values in the dataset:
id              0
label           0
tweet           0
cleaned_text    0
dtype: int64

The 'sentiment' column is missing. Ensure the dataset has the required column.


In [None]:
# Check the distribution of sentiments.
# The class is stored in the 'label' column (the dataset has no 'sentiment'
# column, so indexing it raised KeyError). The column is passed by keyword
# rather than as a positional Series.
sns.countplot(x='label', data=data, palette='viridis')
plt.title('Distribution of Sentiments')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()


# Generate a WordCloud for positive and negative sentiments.
# The class is stored in the integer 'label' column; there is no 'sentiment'
# column and no 'positive'/'negative' string values to compare against.
# NOTE(review): in the preview rows label 1 sits on a complaint and label 0 on
# neutral/positive tweets, so 0 is plotted as "positive" and 1 as "negative" —
# confirm this mapping against the dataset's documentation.
positive_text = ' '.join(data[data['label'] == 0]['cleaned_text'])
negative_text = ' '.join(data[data['label'] == 1]['cleaned_text'])


def _draw_wordcloud(text, title, position):
    """Draw one word cloud panel in a 1x2 grid; return the cloud or None if text is empty."""
    plt.subplot(1, 2, position)
    plt.title(title)
    if not text.strip():
        # WordCloud.generate needs at least one word; show a placeholder
        # instead of aborting the whole figure.
        plt.text(0.5, 0.5, 'No tweets for this class', ha='center', va='center')
        plt.axis('off')
        return None
    cloud = WordCloud(width=400, height=200, background_color='white').generate(text)
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    return cloud


plt.figure(figsize=(10, 5))
wordcloud_pos = _draw_wordcloud(positive_text, 'Positive Sentiments', 1)
wordcloud_neg = _draw_wordcloud(negative_text, 'Negative Sentiments', 2)
plt.show()

# 4. Train-Test Split
# Features are the cleaned tweets; the target is the 'label' column (the
# dataset has no 'sentiment' column, so indexing it raised KeyError).
X = data['cleaned_text']
y = data['label']
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nData split into training and testing sets.")

# 5. Vectorization
# Learn the TF-IDF vocabulary on the training split only, then reuse that
# fitted vectorizer to encode the test split.
tfidf_options = {'max_features': 5000}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
print("\nText vectorization completed!")

# 6. Model Training
# Fit a logistic-regression classifier with default settings on the TF-IDF
# features of the training split.
model = LogisticRegression()
model = model.fit(X_train_vec, y_train)
print("\nModel training completed!")

# 7. Evaluation
# Predict on the held-out split and report plain accuracy as a percentage.
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")

# Confusion Matrix
# Take the class order from the fitted model and use it both to build the
# matrix and to label the axes, so ticks always match the rows/columns.
# The previous hard-coded ['Negative', 'Positive'] ticks assumed exactly two
# classes with the first one being negative, which is not guaranteed for the
# numeric labels in this dataset.
class_labels = list(model.classes_)
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Classification Report
# Per-class precision / recall / F1 for the held-out predictions.
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

# 8. Save the Model and Vectorizer
# Both objects are needed at inference time: the vectorizer to encode new
# text exactly as during training, and the model to classify it.
model_path = '/mnt/data/sentiment_model.pkl'
vectorizer_path = '/mnt/data/vectorizer.pkl'

# open(..., 'wb') does not create missing parent directories, so make sure
# the target folder exists before writing.
for artifact_path in (model_path, vectorizer_path):
    os.makedirs(os.path.dirname(artifact_path), exist_ok=True)

with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)
with open(vectorizer_path, 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

# NOTE: only load these pickles back from trusted locations; unpickling
# untrusted files can execute arbitrary code.
print("\nModel and vectorizer saved successfully!")
