In [66]:
# Import modules
import zipfile
import pandas as pd
import os
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, GlobalMaxPooling1D, Conv1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [55]:
# Get twitter data files
zip_path = './archive (1).zip'  
current_directory = os.getcwd()

# Create a directory to extract files to if it doesn't exist
os.makedirs(current_directory, exist_ok=True)

# Extract the zip file to the current directory
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(current_directory)

# Define the paths to the CSV files
training_file_path = os.path.join(current_directory, 'twitter_training.csv')
validation_file_path = os.path.join(current_directory, 'twitter_validation.csv')

# Read the CSV files into pandas DataFrames
training_data = pd.read_csv(training_file_path)
validation_data = pd.read_csv(validation_file_path)

# Display the first few rows of the datasets
training_data.head()


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [56]:
# Prepare data with a function
def prepare_data(df):
    df.columns = ["tweet id", "entity", "sentiment", "content"]
    df.drop(["tweet id", "entity"], inplace=True, axis=1)

    # Encode label sentiment
    lb = LabelEncoder()
    df["label_sentiment"] = lb.fit_transform(df["sentiment"])

    # Drop any null values
    df.dropna(inplace=True)

    return df

# Apply to data
training_data = prepare_data(training_data)
validation_data = prepare_data(validation_data)
training_data.head()

Unnamed: 0,sentiment,content,label_sentiment
0,Positive,I am coming to the borders and I will kill you...,3
1,Positive,im getting on borderlands and i will kill you ...,3
2,Positive,im coming on borderlands and i will murder you...,3
3,Positive,im getting on borderlands 2 and i will murder ...,3
4,Positive,im getting into borderlands and i can murder y...,3


In [57]:
# Text preprocessing function
nltk.download('wordnet')
nltk.download('stopwords')
wnl = WordNetLemmatizer()
stop_words = stopwords.words('english')

def preprocess_text(text):
    text = re.sub('[^A-Za-z]+', ' ', text)
    text = text.split()
    text = " ".join([wnl.lemmatize(word) for word in text if not word in stop_words and len(word) >= 3])
    return text.lower()

# Apply preprocessing
training_data['content'] = training_data['content'].apply(preprocess_text)
validation_data['content'] = validation_data['content'].apply(preprocess_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cmondy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cmondy\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [63]:
 # Tokenize and Pad the Text Data
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(training_data['content'])

X_train_seq = tokenizer.texts_to_sequences(training_data['content'])
X_val_seq = tokenizer.texts_to_sequences(validation_data['content'])

X_train_padded = pad_sequences(X_train_seq, maxlen=100, padding='post', truncating='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=100, padding='post', truncating='post')

# Encode the Labels
y_train = training_data['label_sentiment'].values
y_val = validation_data['label_sentiment'].values


In [67]:
# Build and Compile a More Complex Model
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=100),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(4, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

In [68]:
# Train the model
history = model.fit(X_train_padded, y_train, epochs=10, batch_size=32, validation_data=(X_val_padded, y_val))

Epoch 1/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 5ms/step - accuracy: 0.4954 - loss: 1.1451 - val_accuracy: 0.8539 - val_loss: 0.4419
Epoch 2/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.7965 - loss: 0.5637 - val_accuracy: 0.9259 - val_loss: 0.2455
Epoch 3/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.8814 - loss: 0.3314 - val_accuracy: 0.9369 - val_loss: 0.2295
Epoch 4/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9131 - loss: 0.2407 - val_accuracy: 0.9339 - val_loss: 0.2628
Epoch 5/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9265 - loss: 0.1958 - val_accuracy: 0.9479 - val_loss: 0.2880
Epoch 6/10
[1m2313/2313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 5ms/step - accuracy: 0.9337 - loss: 0.1710 - val_accuracy: 0.9379 - val_loss: 0.3544
Epoch 7/10

In [73]:
# Evaluate the model
loss, accuracy = model.evaluate(X_val_padded, y_val)
print(f'Test Accuracy: {accuracy * 100}')

[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9447 - loss: 0.4736
Test Accuracy: 94.79479193687439
