# Data Preprocessing ANN Implementation

Importing Dataset and Combining 

In [5]:
import pandas as pd

df1 = pd.read_csv('train.csv',encoding='latin1')
df2 = pd.read_csv('test.csv',encoding='latin1')

# Merge the DataFrames
train_data = pd.concat([df1, df2], ignore_index=True)

# Write the merged DataFrame to a new CSV file
train_data.to_csv('merged_file.csv', index=False)
print(train_data.head())


       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  
0  Afghanistan        38928346.0         652860.0    

Removing the Unnecessary columns

In [6]:
columns_to_remove = ['textID', 'selected_text', 'Time of Tweet', 'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)', 'Density (P/Km²)']
train_data.drop(columns=columns_to_remove, inplace=True)


In [7]:
train_data

Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative
...,...,...
32291,,
32292,,
32293,,
32294,,


In [5]:
# Check for missing values
missing_values = train_data.isnull().sum()
print("Missing Values:\n", missing_values)

# Check for duplicates
duplicate_rows = train_data.duplicated().sum()
print("\nDuplicate Rows:", duplicate_rows)

Missing Values:
 text         1282
sentiment    1281
dtype: int64

Duplicate Rows: 1280


In [8]:
train_data.dropna(subset=['text'], inplace=True)
# train_data.dropna(subset=['text_lower'],inplace=True)


In [9]:
# Lowercase Conversion
train_data['text'] = train_data['text'].str.lower()


In [10]:
train_data

Unnamed: 0,text,sentiment
0,"i`d have responded, if i were going",neutral
1,sooo sad i will miss you here in san diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"sons of ****, why couldn`t they put them on t...",negative
...,...,...
31010,"its at 3 am, im very tired but i can`t sleep ...",negative
31011,all alone in this old house again. thanks for...,positive
31012,i know what you mean. my little dog is sinkin...,negative
31013,_sutra what is your next youtube video gonna b...,positive


*****Text cleaning***

In [12]:
import re

def clean_text(text):
    if isinstance(text, str):  # Check if text is a string
        # Remove special characters, HTML tags, and links
        cleaned_text = re.sub(r"<.*?>", "", text)  # Remove HTML tags
        cleaned_text = re.sub(r"http\S+|www\.\S+", "", cleaned_text)  # Remove links
        cleaned_text = re.sub(r"[^a-zA-Z0-9\s]", "", cleaned_text)  # Remove special characters
        return cleaned_text.lower()  # Convert text to lowercase
    else:
        return text  # Return unchanged if not a string

# Apply text cleaning to 'text' column
train_data['text'] = train_data['text'].apply(clean_text)
train_data

Unnamed: 0,text,sentiment
0,id have responded if i were going,neutral
1,sooo sad i will miss you here in san diego,negative
2,my boss is bullying me,negative
3,what interview leave me alone,negative
4,sons of why couldnt they put them on the rel...,negative
...,...,...
31010,its at 3 am im very tired but i cant sleep bu...,negative
31011,all alone in this old house again thanks for ...,positive
31012,i know what you mean my little dog is sinking...,negative
31013,sutra what is your next youtube video gonna be...,positive


**Stopwords Removal: Remove common stopwords while preserving the links.**

In [11]:
import requests

# Download the stopwords file
url = "https://gist.githubusercontent.com/ZohebAbai/513218c3468130eacff6481f424e4e64/raw/b70776f341a148293ff277afa0d0302c8c38f7e2/gist_stopwords.txt"
response = requests.get(url)

# Check if the request was successful
if response.status_code == 200:
    # Extract stopwords from the content
    stopwords = response.text.split(",")
else:
    print("Failed to download stopwords file.")

# Stopwords removal function
def remove_stopwords(text):
    if isinstance(text, str):
        # Split the text into tokens using whitespace as delimiter
        tokens = text.split()
        # Remove stopwords from the tokens
        filtered_tokens = [word for word in tokens if word.lower() not in stopwords]
        # Join the filtered tokens back into a string
        filtered_text = ' '.join(filtered_tokens)
        return filtered_text
    else:
        return text





# Applying the stopwords removal function to the 'text' column in the train_data DataFrame
train_data['text'] = train_data['text'].apply(remove_stopwords)


In [12]:
train_data

Unnamed: 0,text,sentiment,text_lower,tokens,text_without_stopwords
0,id have responded if i were going,neutral,"i`d have responded, if i were going","[id, have, responded, if, i, were, going]",responded
1,sooo sad i will miss you here in san diego,negative,sooo sad i will miss you here in san diego!!!,"[sooo, sad, i, will, miss, you, here, in, san,...",sooo sad san diego
2,my boss is bullying me,negative,my boss is bullying me...,"[my, boss, is, bullying, me]",boss bullying
3,what interview leave me alone,negative,what interview! leave me alone,"[what, interview, leave, me, alone]",interview leave
4,sons of why couldnt they put them on the rel...,negative,"sons of ****, why couldn`t they put them on t...","[sons, of, why, couldnt, they, put, them, on, ...",sons releases bought
...,...,...,...,...,...
31010,its at 3 am im very tired but i cant sleep bu...,negative,"its at 3 am, im very tired but i can`t sleep ...","[its, at, 3, am, im, very, tired, but, i, cant...",3 tired sleep
31011,all alone in this old house again thanks for ...,positive,all alone in this old house again. thanks for...,"[all, alone, in, this, old, house, again, than...",house net alive kicking invented net wanna kis...
31012,i know what you mean my little dog is sinking...,negative,i know what you mean. my little dog is sinkin...,"[i, know, what, you, mean, my, little, dog, is...",dog sinking depression someplace tropical
31013,sutra what is your next youtube video gonna be...,positive,_sutra what is your next youtube video gonna b...,"[sutra, what, is, your, next, youtube, video, ...",sutra youtube video gonna love videos


In [21]:
from sklearn.preprocessing import LabelEncoder

# Encode the sentiment labels
label_encoder = LabelEncoder()
train_data['sentiment'] = label_encoder.fit_transform(train_data['sentiment'])


In [22]:
class CustomTokenizer:
    def __init__(self, num_words=None):
        self.num_words = num_words
        self.word_to_index = {}
        self.index_to_word = {}
        self.word_counts = {}
        self.index = 1  # Start index from 1 (0 reserved for padding)

    def fit_on_texts(self, texts):
        for text in texts:
            for word in text.split():
                if word not in self.word_counts:
                    self.word_counts[word] = 1
                else:
                    self.word_counts[word] += 1

        # Sort words by frequency and select top num_words if specified
        sorted_words = sorted(self.word_counts.items(), key=lambda x: x[1], reverse=True)
        if self.num_words:
            sorted_words = sorted_words[:self.num_words]

        # Assign index to each word
        for word, _ in sorted_words:
            self.word_to_index[word] = self.index
            self.index_to_word[self.index] = word
            self.index += 1

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            sequence = [self.word_to_index[word] for word in text.split() if word in self.word_to_index]
            sequences.append(sequence)
        return sequences





Final Python Script with all the implementations

In [23]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense
from keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences


class CustomTokenizer:
    def __init__(self, num_words=None):
        self.num_words = num_words
        self.word_to_index = {}
        self.index_to_word = {}
        self.word_counts = {}
        self.index = 1  # Start index from 1 (0 reserved for padding)

    def fit_on_texts(self, texts):
        for text in texts:
            for word in text.split():
                if word not in self.word_counts:
                    self.word_counts[word] = 1
                else:
                    self.word_counts[word] += 1

        # Sort words by frequency and select top num_words if specified
        sorted_words = sorted(self.word_counts.items(), key=lambda x: x[1], reverse=True)
        if self.num_words:
            sorted_words = sorted_words[:self.num_words]

        # Assign index to each word
        for word, _ in sorted_words:
            self.word_to_index[word] = self.index
            self.index_to_word[self.index] = word
            self.index += 1

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            sequence = [self.word_to_index[word] for word in text.split() if word in self.word_to_index]
            sequences.append(sequence)
        return sequences



# Define a custom preprocessing function
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, punctuation, and numbers
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Join tokens back into a string
    clean_text = " ".join(tokens)
    return clean_text
import pandas as pd

df1 = pd.read_csv('train.csv',encoding='latin1')
df2 = pd.read_csv('test.csv',encoding='latin1')

# Merge the DataFrames
twitter_data = pd.concat([df1, df2], ignore_index=True)

# Write the merged DataFrame to a new CSV file
twitter_data.to_csv('merged_file.csv', index=False)

# Drop rows with NaN or null values in the 'text' column
twitter_data.dropna(subset=['text'], inplace=True)

# Apply the preprocessing function to the 'text' column
twitter_data['text'] = twitter_data['text'].apply(preprocess_text)

# Create an instance of CustomTokenizer
custom_tokenizer = CustomTokenizer(num_words=5000)
custom_tokenizer.fit_on_texts(twitter_data['text'])

# Convert sentiment labels to numeric form
sentiment_mapping = {'neutral': 0, 'positive': 1, 'negative': 2}
twitter_data['sentiment_encoded'] = twitter_data['sentiment'].map(sentiment_mapping)

# Split the dataset into training and testing sets
X = twitter_data['text']
y = twitter_data['sentiment_encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert texts to sequences
X_train_seq = custom_tokenizer.texts_to_sequences(X_train)
X_test_seq = custom_tokenizer.texts_to_sequences(X_test)

# Pad sequences to ensure uniform length
maxlen = 100
X_train_padded = pad_sequences(X_train_seq, maxlen=maxlen)
X_test_padded = pad_sequences(X_test_seq, maxlen=maxlen)

# Convert labels to categorical format
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)

# Define a function to create the model with specified hyperparameters
def create_model(optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=len(custom_tokenizer.word_to_index) + 1, output_dim=100, input_length=maxlen))
    model.add(Flatten())
    model.add(Dense(64, activation='relu'))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Define the hyperparameters to tune
optimizers = ['adam', 'rmsprop', 'sgd']
epochs_list = [2,3, 5, 8]
batch_sizes = [64, 128, 256]

# Perform hyperparameter tuning
best_accuracy = 0
best_hyperparameters = {}

for optimizer in optimizers:
    for epochs in epochs_list:
        for batch_size in batch_sizes:
            print(f"Training model with optimizer: {optimizer}, epochs: {epochs}, batch_size: {batch_size}")
            
            # Create and compile the model with current hyperparameters
            model = create_model(optimizer=optimizer)
            
            # Train the model
            history = model.fit(X_train_padded, y_train_categorical, epochs=epochs, batch_size=batch_size, validation_split=0.3, verbose=1)
            
            # Evaluate the model on validation data
            _, accuracy = model.evaluate(X_test_padded, y_test_categorical, verbose=0)
            print(f"Validation Accuracy: {accuracy}")
            
            # Update best accuracy and hyperparameters if needed
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_hyperparameters = {'optimizer': optimizer, 'epochs': epochs, 'batch_size': batch_size}

print("Best hyperparameters:", best_hyperparameters)
print("Best validation accuracy:", best_accuracy)


Training model with optimizer: adam, epochs: 2, batch_size: 64
Epoch 1/2
Epoch 2/2
Validation Accuracy: 0.6843462586402893
Training model with optimizer: adam, epochs: 2, batch_size: 128
Epoch 1/2
Epoch 2/2
Validation Accuracy: 0.6637111306190491
Training model with optimizer: adam, epochs: 2, batch_size: 256
Epoch 1/2
Epoch 2/2
Validation Accuracy: 0.6827341318130493
Training model with optimizer: adam, epochs: 3, batch_size: 64
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Accuracy: 0.644688069820404
Training model with optimizer: adam, epochs: 3, batch_size: 128
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Accuracy: 0.6748347282409668
Training model with optimizer: adam, epochs: 3, batch_size: 256
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Accuracy: 0.6814444661140442
Training model with optimizer: adam, epochs: 5, batch_size: 64
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Validation Accuracy: 0.6384007930755615
Training model with optimizer: adam, epochs: 5, batch_size: 128
Epoch 1/5
Ep

KeyboardInterrupt: 