# Imports


In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet 
from os import walk
from nltk.corpus import stopwords
from nltk.corpus import words
import csv
from string import punctuation
import re
from bs4 import BeautifulSoup as beauty
from gensim.models import KeyedVectors
from itertools import chain

nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('words')

stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/stu4/s12/asg9582/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/stu4/s12/asg9582/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/stu4/s12/asg9582/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /home/stu4/s12/asg9582/nltk_data...
[nltk_data]   Package words is already up-to-date!


# Read Data

In [2]:
tweets1 = pd.read_csv('datasets/tweets-1.csv')
tweets2 = pd.read_csv('datasets/tweets-2.csv')

unique_sentiments_tweets1 = tweets1['sentiment'].value_counts()
unique_sentiments_tweets2 = tweets2['category'].value_counts()
print("Unique sentiment counts in tweets1:")
print(unique_sentiments_tweets1)
print("\nUnique sentiment counts in tweets2:")
print(unique_sentiments_tweets2)
display(tweets1)
display(tweets2)

Unique sentiment counts in tweets1:
neutral     11118
positive     8582
negative     7781
Name: sentiment, dtype: int64

Unique sentiment counts in tweets2:
 1.0    72250
 0.0    55213
-1.0    35510
Name: category, dtype: int64


Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative
...,...,...,...,...
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


# Tweets to AT-LSTM Cleaning

In [3]:
tweets1_atlstm = tweets1[tweets1['sentiment'] != 'neutral'][['text', 'sentiment']].reset_index(drop=True)


tweets2_atlstm = tweets2[tweets2['category'] != 0][['clean_text', 'category']].copy()
tweets2_atlstm['sentiment'] = tweets2_atlstm['category'].map({-1.0: 'negative', 0.0: 'neutral', 1.0: 'positive'})
tweets2_atlstm = tweets2_atlstm[['clean_text', 'sentiment']].reset_index(drop=True)
tweets2_atlstm.rename(columns={'clean_text': 'text'}, inplace=True)

unique_sentiments_tweets1 = tweets1_atlstm['sentiment'].value_counts()
unique_sentiments_tweets2 = tweets2_atlstm['sentiment'].value_counts()
tweets1_atlstm.dropna(inplace=True)
tweets2_atlstm.dropna(inplace=True)
combined_df = pd.concat([tweets1_atlstm, tweets2_atlstm])
print(combined_df['sentiment'].value_counts())
missing_values_count = combined_df.isnull().sum()
print(missing_values_count)
combined_df.rename(columns={'text': 'review'}).to_csv('datasets/atlstm_tweets.csv', index=False)\

display(combined_df)


positive    80831
negative    43290
Name: sentiment, dtype: int64
text         0
sentiment    0
dtype: int64


Unnamed: 0,text,sentiment
0,Sooo SAD I will miss you here in San Diego!!!,negative
1,my boss is bullying me...,negative
2,what interview! leave me alone,negative
3,"Sons of ****, why couldn`t they put them on t...",negative
4,2am feedings for the baby are fun when he is a...,positive
...,...,...
107762,engine growth modi unveils indias first 12000 ...,positive
107763,modi promised 2014 lok sabha elections that be...,positive
107764,why these 456 crores paid neerav modi not reco...,negative
107765,dear rss terrorist payal gawar what about modi...,negative


# Tweets to Bi_LSTM and ANN

In [4]:

def remove_html_tags(row):
    return beauty(row, 'html.parser').text


def tokenize(input_text):
    tokens = re.sub('[^a-zA-Z]', ' ', input_text).lower().split()

    # tokens = word_tokenizer.tokenize(input_text)
    return tokens


def remove_stop_words(input_text_vector):
    filtered = []
    for word in input_text_vector:
        if word.isalpha() and word not in stop_words:
            filtered.append(word)
    return filtered


def standardize(input_label):
    return 1 if input_label == 'positive' else 0


def all_at_once(input_text):
    cleaned = remove_html_tags(input_text)
    cleaned = tokenize(cleaned)
    cleaned = remove_stop_words(cleaned)
    return cleaned

def hypernym_list(tokens):
    hypernym_tokens = []
    for word in tokens:
        My_sysn = wordnet.synsets(word)
        if len(My_sysn) == 0:
            hypernym_tokens.append(word)
        else:
            hypernym_tokens.append(My_sysn[0].lemma_names()[0])
    return hypernym_tokens

In [5]:
tweets1_atlstm['tokenized'] = tweets1_atlstm['text'].apply(lambda x: all_at_once(x))
tweets2_atlstm['tokenized'] = tweets2_atlstm['text'].apply(lambda x: all_at_once(x))
print("tokenized")
tweets1_atlstm['hypernym'] = tweets1_atlstm['tokenized'].apply(lambda x: hypernym_list(x))
print("hypernymed 1...")
tweets2_atlstm['hypernym'] = tweets2_atlstm['tokenized'].apply(lambda x: hypernym_list(x))
print("hypernymed 2")
display(tweets1_atlstm)

  


tokenized
hypernymed 1...
hypernymed 2


Unnamed: 0,text,sentiment,tokenized,hypernym
0,Sooo SAD I will miss you here in San Diego!!!,negative,"[sooo, sad, miss, san, diego]","[sooo, sad, girl, san, diego]"
1,my boss is bullying me...,negative,"[boss, bullying]","[foreman, bullying]"
2,what interview! leave me alone,negative,"[interview, leave, alone]","[interview, leave, alone]"
3,"Sons of ****, why couldn`t they put them on t...",negative,"[sons, put, releases, already, bought]","[son, put_option, release, already, buy]"
4,2am feedings for the baby are fun when he is a...,positive,"[feedings, baby, fun, smiles, coos]","[eating, baby, fun, smile, coo]"
...,...,...,...,...
16358,enjoy ur night,positive,"[enjoy, ur, night]","[enjoy, Ur, night]"
16359,wish we could come see u on Denver husband l...,negative,"[wish, could, come, see, u, denver, husband, l...","[wish, could, semen, see, uracil, Denver, husb..."
16360,I`ve wondered about rake to. The client has ...,negative,"[wondered, rake, client, made, clear, net, for...","[wonder, rake, client, make, clear, internet, ..."
16361,Yay good for both of you. Enjoy the break - y...,positive,"[yay, good, enjoy, break, probably, need, hect...","[Yay, good, enjoy, interruption, probably, nee..."


In [6]:
def process_dataframe(df, tokenized_csv_file, hypernyms_csv_file):
    # Convert sentiment from letter to number
    df['sentiment'] = df['sentiment'].apply(lambda x: 1 if x == 'negative' else 10)
    
    # Write sentiment and tokenized columns to CSV
    with open(tokenized_csv_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_NONE, escapechar=' ')
        for index, row in df.iterrows():
            sentiment = str(row['sentiment'])
            tokenized = ','.join(row['tokenized'])
            writer.writerow([sentiment, tokenized])
    
    # Write hypernyms column to CSV
    with open(hypernyms_csv_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, quoting=csv.QUOTE_NONE, escapechar=' ')
        for index, row in df.iterrows():
            hypernyms = ','.join(row['hypernym'])
            if not hypernyms:
                print(f"Empty hypernyms list encountered at index {index}")
                print(row)
            writer.writerow([hypernyms])


In [7]:
nullcount1 = tweets1_atlstm.isnull().sum()
print(nullcount1)

text         0
sentiment    0
tokenized    0
hypernym     0
dtype: int64


In [8]:
tweets1_atlstm = tweets1_atlstm[(tweets1_atlstm['tokenized'].apply(len) > 0) & (tweets1_atlstm['hypernym'].apply(len) > 0)]
tweets2_atlstm = tweets2_atlstm[(tweets2_atlstm['tokenized'].apply(len) > 0) & (tweets2_atlstm['hypernym'].apply(len) > 0)]

process_dataframe(tweets1_atlstm,  "datasets/small_tweets_bilstm.csv", "datasets/small_tweets_ANN.csv")
process_dataframe(tweets2_atlstm, "datasets/big_tweets_bilstm.csv", "datasets/big_tweets_ANN.csv")