# Sentiment analysis of tweets

## Step 1 Data Preprocessing

In [1]:
import pandas as pd

In [2]:
# Read the training tweets from a CSV file given by a relative path
dataset = pd.read_csv(filepath_or_buffer="dataset/twitter_training.csv")

In [3]:
# Presumably a fixed seed for reproducible runs.
# NOTE(review): no cell visible in this notebook reads RANDOM_STATE — confirm it is still needed.
RANDOM_STATE = 42

In [4]:
# Preview the first rows of the training data
dataset.head()

Unnamed: 0,chatID,citeName,status,tweetText
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [5]:
# (rows, columns) of the training frame
dataset.shape

(74682, 4)

#### Text Cleaning

In [None]:
import nltk
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources used below: tokenizer models, stopword list, WordNet
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Shared helpers used by the text-cleaning function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [7]:
# Define text preprocessing function
def preprocess_text(text):
    """Normalize a tweet into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase the text, drop every character that is in
    ``string.punctuation`` or is a digit, tokenize with ``word_tokenize``,
    discard tokens found in ``stop_words``, and lemmatize the remaining
    tokens with ``lemmatizer``.

    Args:
        text: Tweet text. Any value that is not a ``str`` (``None``, NaN,
            numbers, ...) is treated as having no text.

    Returns:
        str: The cleaned text. Always a string: ``''`` is returned for
        non-string input and when cleaning fails, so a column built with
        ``Series.apply(preprocess_text)`` never ends up holding ``None``.
    """
    # Explicit guard instead of relying on ``text.lower()`` raising for
    # non-string values and the handler below swallowing the error.
    if not isinstance(text, str):
        return ''

    try:
        # Convert to lowercase
        lowered = text.lower()

        # Remove punctuation and numbers
        stripped = ''.join(
            char for char in lowered
            if char not in string.punctuation and not char.isdigit()
        )

        # Tokenize text
        tokens = word_tokenize(stripped)

        # Remove stopwords and lemmatize
        kept = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

        return ' '.join(kept)
    except Exception as e:
        # Best effort: report the failing tweet but keep the return type a
        # string so downstream text processing does not receive None.
        print(f"Error processing text: {text}, Error: {str(e)}")
        return ''

In [8]:
# Count missing values in each column
dataset.isnull().sum()

chatID         0
citeName       0
status         0
tweetText    686
dtype: int64

In [9]:
# Replace missing tweetText values (NaN/None) with empty strings.
# fillna only touches missing entries; other values are left as they are.
dataset['tweetText'] = dataset['tweetText'].fillna('')

In [10]:
# Re-count missing values in each column after filling tweetText
dataset.isnull().sum()

chatID       0
citeName     0
status       0
tweetText    0
dtype: int64

In [11]:
# Wrap each tweetText value in double quotes; values that already begin with
# a double quote are left unchanged
def _ensure_leading_quote(tweet):
    """Return ``tweet`` wrapped in double quotes unless it already starts with one."""
    if tweet.startswith('"'):
        return tweet
    return f'"{tweet}"'

dataset['tweetText'] = dataset['tweetText'].apply(_ensure_leading_quote)

In [12]:
# Clean every tweet and keep the result in a new column
dataset['cleaned_text'] = dataset['tweetText'].apply(preprocess_text)

# Show the original and cleaned text side by side for the first rows
preview_columns = ['tweetText', 'cleaned_text']
print(dataset[preview_columns].head())

                                           tweetText  \
0  "im getting on borderlands and i will murder y...   
1  "I am coming to the borders and I will kill yo...   
2  "im getting on borderlands and i will kill you...   
3  "im coming on borderlands and i will murder yo...   
4  "im getting on borderlands 2 and i will murder...   

                   cleaned_text  
0  im getting borderland murder  
1            coming border kill  
2    im getting borderland kill  
3   im coming borderland murder  
4  im getting borderland murder  


## Step 2 Feature Engineering

### Step 2.1 Vectorization of Text Data by TF-IDF (Term Frequency-Inverse Document Frequency)

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TF-IDF vectorizer with its default settings
tfidf_vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF weights from the tweetText column and vectorize it.
# NOTE(review): this vectorizes tweetText, not the cleaned_text column computed
# earlier; the validation cells also use tweetText — confirm this is intended.
tweet_texts = dataset['tweetText']
X_tfidf = tfidf_vectorizer.fit_transform(tweet_texts)


### Step 2.2 Handling Categorical Data by One-Hot Encoding

In [14]:
from sklearn.preprocessing import OneHotEncoder

# Initialize the OneHotEncoder.
# handle_unknown='ignore' makes transform() encode a category that was not
# present at fit time as an all-zero row instead of raising an error, so the
# validation set and ad-hoc inputs with an unseen citeName can still be scored.
# Categories seen during fit are encoded exactly as with the default setting.
onehot_encoder = OneHotEncoder(handle_unknown='ignore')

# Fit and transform the citeName column
X_citeName = onehot_encoder.fit_transform(dataset[['citeName']])


In [15]:
# Combining Features
from scipy.sparse import hstack

# Stack the TF-IDF and one-hot matrices side by side into one feature matrix
feature_blocks = [X_tfidf, X_citeName]
X = hstack(feature_blocks)

### Step 2.3 Handling the Target Variable

In [16]:
from sklearn.preprocessing import LabelEncoder

# Create the encoder that maps status labels to integer class ids
label_encoder = LabelEncoder()

# Learn the label mapping from the status column and encode it
status_labels = dataset['status']
y = label_encoder.fit_transform(status_labels)

## Step 3 Model Building

### Step 3.1 Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Create a logistic-regression classifier with default hyperparameters
model = LogisticRegression()

# Fit the classifier on the combined feature matrix and the encoded labels
model.fit(X=X, y=y)

### Step 3.2 Testing the model

#### Step 3.2.1 Preprocessing the validation set.

In [19]:
# Read the validation tweets from a CSV file given by a relative path
dataset_val = pd.read_csv(filepath_or_buffer="dataset/twitter_validation.csv")

In [20]:
# Replace missing tweetText values in the validation frame with empty strings
dataset_val['tweetText'] = dataset_val['tweetText'].fillna('')

In [21]:
# Vectorize the validation tweets with the already-fitted vectorizer (transform only, no refit)
X_val_tfidf = tfidf_vectorizer.transform(dataset_val['tweetText'])

In [22]:
# One-hot encode the validation citeName column with the already-fitted encoder (transform only)
X_val_citeName = onehot_encoder.transform(dataset_val[['citeName']])

In [23]:
# Combine the validation features in the same column order as the training matrix X
X_val = hstack([X_val_tfidf, X_val_citeName])

In [24]:
# Encode the validation status labels with the mapping learned from the training labels
y_val = label_encoder.transform(dataset_val['status'])

#### Step 3.2.2 Making Predictions

In [25]:
# Predict an encoded status class for every validation row
y_pred = model.predict(X=X_val)

## Step 4 Evaluation

In [26]:
# Score the validation predictions; precision, recall and F1 use the
# 'weighted' average across classes
weighted = {'average': 'weighted'}
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, **weighted)
recall = recall_score(y_val, y_pred, **weighted)
f1 = f1_score(y_val, y_pred, **weighted)

# Report each metric on its own line
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')

Accuracy: 0.9
Precision: 0.9006309001972567
Recall: 0.9
F1 Score: 0.8998731579581157


## Step 5 Testing the model on new tweet text

In [None]:
# Example tweet to classify
new_text = "I'm so excited about the new Borderlands game!"

# Minimal preprocessing: lowercase only
# TODO: add any other preprocessing steps applied to the training text
new_text_cleaned = str.lower(new_text)

# Build the feature row with the already-fitted vectorizer and encoder,
# stacking the blocks in the same order as the training matrix
new_text_tfidf = tfidf_vectorizer.transform([new_text_cleaned])
new_text_citeName = onehot_encoder.transform([['CS-GO']])  # Example citeName
new_text_features = hstack([new_text_tfidf, new_text_citeName])

# Predict the encoded class, then map it back to its status label
predicted_sentiment = model.predict(new_text_features)
predicted_sentiment_label = label_encoder.inverse_transform(predicted_sentiment)

In [32]:
# Show the decoded label of the single prediction
print(f'The predicted sentiment is: {predicted_sentiment_label[0]}')

The predicted sentiment is: Negative
