In [1]:
import time
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [2]:
# Column names for the header-less CSV, in file order.
columns=['Target','IDs','Date','Flag','User','Text']

# NOTE: errors='ignore' silently drops any bytes that are not valid UTF-8,
# so non-decodable characters vanish from the text instead of raising.
with open("twitter_new.csv",
          'r', encoding='utf-8', errors='ignore') as file:
    data=pd.read_csv(file, header=None, names=columns)

# Split every whitespace-separated 'Date' string exactly once and fan the
# resulting tokens out into named columns (token 0 -> Weekday, ... token 5 -> Year),
# instead of re-splitting the same string for each of the six columns.
date_fields=['Weekday','Month','Day','Time','Timezone','Year']
date_tokens=data['Date'].str.split(expand=True)
for position, field in enumerate(date_fields):
    data[field]=date_tokens[position]
del date_tokens  # large intermediate; no longer needed once the columns are copied

data=data.drop('Date', axis=1)

#Find total unique values
cols=data.columns
for col in cols:
    print('Total Unique ['+ col + '] = ' + str(len(data[col].unique())))

Total Unique [Target] = 2
Total Unique [IDs] = 1598315
Total Unique [Flag] = 1
Total Unique [User] = 659775
Total Unique [Text] = 1581466
Total Unique [Weekday] = 7
Total Unique [Month] = 3
Total Unique [Day] = 29
Total Unique [Time] = 86386
Total Unique [Timezone] = 1
Total Unique [Year] = 1


In [3]:
# Flag, Timezone and Year each hold a single unique value, so they carry
# no information; remove all three in one call.
data = data.drop(columns=['Flag', 'Timezone', 'Year'])

def time_to_sec(time):
    """Convert an 'HH:MM:SS' string into a number of seconds.

    Args:
        time: String with exactly three ':'-separated integer fields
            (hours, minutes, seconds).

    Returns:
        int: hours * 3600 + minutes * 60 + seconds.

    Raises:
        ValueError: If the string does not have exactly three fields or a
            field is not an integer literal.
    """
    hours, minutes, seconds = time.split(':')
    weighted_fields = zip((hours, minutes, seconds), (3600, 60, 1))
    return sum(int(field) * weight for field, weight in weighted_fields)

# Make the clock string and the day-of-month string numeric.
data['Time'] = data['Time'].map(time_to_sec)
data['Day'] = data['Day'].apply(int)

# User and IDs are treated as irrelevant to sentiment analysis, so both go.
data = data.drop(columns=['User', 'IDs'])

# Report the number of distinct values in every remaining column.
cols = data.columns
for col in cols:
    print(f'Total Unique [{col}] = {data[col].unique().size}')

Total Unique [Target] = 2
Total Unique [Text] = 1581466
Total Unique [Weekday] = 7
Total Unique [Month] = 3
Total Unique [Day] = 29
Total Unique [Time] = 86386


In [5]:
#Now we clean the text and process it
def text_cleaning(input_string):
    """Strip links, punctuation and digits from a string and tidy whitespace.

    Steps, in order:
      1. Remove ``http(s)://`` links, ``www.`` links, and any run of
         non-space characters ending in ``.`` plus two or more lowercase
         letters (e.g. ``example.com``).
      2. Remove every character that is neither a word character nor
         whitespace (underscores count as word characters and are kept).
      3. Remove every digit.
      4. Collapse whitespace runs to a single space and trim both ends.

    Args:
        input_string: Raw text to clean.

    Returns:
        str: The cleaned text; may be empty.
    """
    # Gotcha: the third alternative also swallows "word.next" when the space
    # after a full stop is missing, because it looks like a bare domain.
    link_pattern = r'https?://\S+|www\.\S+|\S+\.([a-z]{2,})\b'
    cleaned_string = re.sub(link_pattern, '', input_string)
    cleaned_string = re.sub(r'[^\w\s]', '', cleaned_string)
    # A single pass removes all digits, standalone or embedded in a word.
    cleaned_string = re.sub(r'\d+', '', cleaned_string)
    # Whitespace is normalised last so the gaps left behind by removed
    # links and numbers do not survive as double spaces.
    cleaned_string = re.sub(r'\s+', ' ', cleaned_string)
    return cleaned_string.strip()

# Run the regex-based cleaner over every tweet.
data['Text'] = data['Text'].map(text_cleaning)

#Remove all the stopwords
# Fetch the NLTK resources, in this order. NOTE(review): 'wordnet' does not
# appear to be used by the code visible here -- confirm before removing.
for nltk_resource in ('wordnet', 'stopwords', 'punkt'):
    nltk.download(nltk_resource)

# One shared stemmer instance, reused for every row.
stemmer = PorterStemmer()

# English stopword set, filled in on first use and then reused.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stopwords as a set, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stopwords_and_stemming(input_string):
    """Tokenize, lowercase, drop English stopwords, and Porter-stem a string.

    Args:
        input_string: Text to process.

    Returns:
        str: The stemmed, non-stopword tokens joined by single spaces;
        empty if no token survives.
    """
    # The stopword set is fetched from a cache rather than rebuilt here,
    # because this function is applied to every row of the frame.
    stop_words = _english_stop_words()
    # Lowercase each token once, then filter and stem in a single pass.
    lowered_tokens = (token.lower() for token in nltk.word_tokenize(input_string))
    return ' '.join(
        stemmer.stem(token) for token in lowered_tokens if token not in stop_words
    )

# Stem every tweet, then keep only the rows whose text is not empty.
data['Text'] = data['Text'].map(remove_stopwords_and_stemming)
has_text = data['Text'].ne('')
data = data[has_text]

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [6]:
# Print a summary of the cleaned frame: index range, per-column non-null
# counts and dtypes, and approximate memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1599098 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   Target   1599098 non-null  int64 
 1   Text     1599098 non-null  object
 2   Weekday  1599098 non-null  object
 3   Month    1599098 non-null  object
 4   Day      1599098 non-null  int64 
 5   Time     1599098 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 85.4+ MB


In [7]:
# Write the cleaned frame to 'Cleaned_Data.csv' in the working directory,
# omitting the index column.
# NOTE(review): a Text value such as "nan" or "null" would presumably be
# parsed as missing by a default pd.read_csv on reload -- confirm downstream.
data.to_csv('Cleaned_Data.csv', index=False)