### SJSU ChatGPT Tweets Analysis

In [None]:
#import libraries

import pandas as pd
import numpy as np

In [None]:
# Load the tweet CSV into a DataFrame and preview its first rows.
df = pd.read_csv('chatgpt1.csv')
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Count rows that exactly repeat an earlier row.
df.duplicated().sum()

#no duplicate values found

In [None]:
# Column names as a plain Python list.
df.columns.values.tolist()

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

#Outlinks, Countlinks, Media, QuotedTweet, MentionedUsers have null values
#These null values will not be imputed as some tweets might not have outlinks, media, mentioned users, etc.

### Data Preprocessing

In [None]:
# Preprocess a copy so the loaded frame `df` is left as read from disk.
df1 = df.copy()

In [None]:
# Preview the working copy.
df1.head()

In [None]:
#renaming column names

In [None]:
# Normalise every column label to lowercase.
df1.columns = df1.columns.map(str.lower)

In [None]:
# Inspect the column labels after lowercasing.
df1.columns

In [None]:
# Map the lowercased source headers to snake_case names.
# Every key must be all-lowercase: the column labels were lowercased before
# this cell runs, and rename() silently skips keys that match no column
# (the previous key 'tweet Id' therefore never matched anything).
df1.rename(
    columns={
        'tweet id': 'tweet_id',
        'text': 'tweet',
        'permalink': 'tweet_link',
        'user': 'user_handle',
        'countlinks': 'mentioned_link',
        'replycount': 'reply_count',
        'retweetcount': 'retweet_count',
        'likecount': 'like_count',
        'quotecount': 'quote_count',
        'conversationid': 'conversation_id',
        'source': 'source_link',
        'media': 'media_present',
        'mentionedusers': 'mentioned_users',
    },
    inplace=True,
)

In [None]:
# Inspect the column labels after the rename.
df1.columns

In [None]:
#countlinks is an alias (short link) for outlinks
#dropping column outlinks

In [None]:
# Remove the 'outlinks' column from the working frame in place.
df1.drop(columns=['outlinks'], inplace=True)

In [None]:
#dropping languages other than English

In [None]:
# Delete, in place, every row whose 'language' value is anything other than 'en'.
df1.drop(index=df1[df1['language'] != 'en'].index, inplace=True)

In [None]:
# (rows, columns) of the working frame at this point.
df1.shape

In [None]:
#ASCII cannot represent the characters used by many other languages,
#which is why tweets in languages other than English were dropped

In [None]:
#Removing emojis
#Emojis are encoded in the Unicode Standard

In [None]:
# Cast every cell to str, then round-trip each column through ASCII with
# errors='ignore' so any non-ASCII character (emojis included) is removed.
# Because of the str cast, missing values become the literal text 'nan'.
df1 = df1.astype(str).apply(
    lambda col: col.str.encode('ascii', 'ignore').str.decode('ascii')
)

In [None]:
# Preview the frame after the ASCII-only conversion.
df1.head()

In [None]:
#Splitting Datetime attribute to date and time
#Storing Date as a new attribute

In [None]:
# Split each 'datetime' string at its first space only (n=1); expand=True
# returns the pieces as separate columns labelled 0 and 1.
date_time_split = df1['datetime'].str.split(' ', n = 1, expand = True)

In [None]:
# Keep the piece before the first space as the new 'date' column.
df1['date'] = date_time_split[0]

In [None]:
# The combined 'datetime' column is no longer needed; remove it in place.
df1.drop(columns=['datetime'], inplace=True)

In [None]:
# Distinct values present in the 'date' column.
df1['date'].unique()

#3 unique date values

In [None]:
# Distinct values present in the 'source_link' column.
df1['source_link'].unique()

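#unique source_link values (TODO: confirm the count; this note previously repeated the date cell's "3 unique date values")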

In [None]:
#drop source
# NOTE(review): this targets the raw frame `df`, whereas the 'source_link' name
# is only assigned by the rename performed on `df1`, and a later cell still
# reads df1['source_link'] - confirm which frame (if any) should lose the column.
df.drop('source_link', axis=1, inplace=True)

In [None]:
#replace empty square brackets in hashtag column with Null

In [None]:
# Replace the text '[]' in 'hashtag' with the placeholder 'Null'.
# The result is assigned back rather than calling replace(inplace=True) on the
# column selection: that chained in-place form warns on recent pandas and does
# not update df1 once Copy-on-Write is enabled.
df1['hashtag'] = df1['hashtag'].replace(to_replace=['[]'], value='Null')

In [None]:
#change nan to Null in 'Outlinks', 'Countlinks', 'Media', 'QuotedTweet', 'MentionedUsers'

In [None]:
# Replace the text 'nan' in 'mentioned_link' with the placeholder 'Null'.
# Assigned back instead of using replace(inplace=True) on the column selection,
# which warns on recent pandas and is a no-op under Copy-on-Write.
df1['mentioned_link'] = df1['mentioned_link'].replace(to_replace=['nan'], value='Null')

In [None]:
# Replace the text 'nan' in 'media_present' with the placeholder 'Null'.
# Assigned back instead of using replace(inplace=True) on the column selection,
# which warns on recent pandas and is a no-op under Copy-on-Write.
df1['media_present'] = df1['media_present'].replace(to_replace=['nan'], value='Null')

In [None]:
# Replace the text 'nan' in 'quotedtweet' with the placeholder 'Null'.
# Assigned back instead of using replace(inplace=True) on the column selection,
# which warns on recent pandas and is a no-op under Copy-on-Write.
df1['quotedtweet'] = df1['quotedtweet'].replace(to_replace=['nan'], value='Null')

In [None]:
# Replace the text 'nan' in 'mentioned_users' with the placeholder 'Null'.
# Assigned back instead of using replace(inplace=True) on the column selection,
# which warns on recent pandas and is a no-op under Copy-on-Write.
df1['mentioned_users'] = df1['mentioned_users'].replace(to_replace=['nan'], value='Null')

In [None]:
#Normalisation of text column

In [None]:
def clean_text(X):
    """Return X with every whitespace-separated token that is a web link removed.

    A token counts as a link when it starts with 'http://' or 'https://'
    (previously only 'https://' was recognised, so plain-http links survived).
    The remaining tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed.
    """
    link_prefixes = ('http://', 'https://')
    kept = [token for token in X.split() if not token.startswith(link_prefixes)]
    return ' '.join(kept)

# Run this cell's clean_text over every value in the 'tweet' column.
df1['tweet'] = df1['tweet'].apply(clean_text)

In [None]:
def clean_text(X):
    """Return X without the whitespace-separated tokens that begin with '@'.

    Surviving tokens are re-joined with single spaces.
    """
    kept_words = []
    for word in X.split():
        if word.startswith('@'):
            continue
        kept_words.append(word)
    return ' '.join(kept_words)

# Run this cell's clean_text over every value in the 'tweet' column.
df1['tweet'] = df1['tweet'].apply(clean_text)

In [None]:
def clean_text(X):
    """Return X without the whitespace-separated tokens that begin with '#'.

    Surviving tokens are re-joined with single spaces.
    """
    return ' '.join(word for word in X.split() if not word.startswith('#'))

# Run this cell's clean_text over every value in the 'tweet' column.
df1['tweet'] = df1['tweet'].apply(clean_text)

In [None]:
# Preview the frame after the tweet text clean-up.
df1.head()

In [None]:
#cleaning columns with links

In [None]:
# Trim list-literal punctuation from each value: any leading run of '[' / "'"
# characters and any trailing run of "'" / ']' characters.
df1['mentioned_link'] = df1['mentioned_link'].map(
    lambda link: link.lstrip("['").rstrip("']")
)

In [None]:
# Remove the literal '<a href="' prefix and the literal 'a>' suffix.
# str.lstrip/str.rstrip treat their argument as a *set of characters*, not as a
# prefix/suffix: lstrip('<a href="') also consumed the leading 'h' of 'http...'
# and rstrip('a>') consumed any trailing run of 'a' / '>' characters.
# Anchored patterns remove exactly one occurrence at each end instead.
# NOTE(review): a value ending in '</a>' is left ending in '</' - confirm
# whether the intended suffix was '</a>'.
df1['source_link'] = df1['source_link'].str.replace(r'^<a href="', '', regex=True)
df1['source_link'] = df1['source_link'].str.replace(r'a>$', '', regex=True)

In [None]:
#check mentioned_users column

In [None]:
#check hashtag column

In [None]:
#split into 4 different tables

In [None]:
#write to csv

In [None]:
# Write the processed frame to disk as UTF-8, with a header row and without
# the index column.
df1.to_csv('chatgpt_e.csv', encoding='utf-8', index=False, header=True)

In [None]:
#check without encoding
# Same export to a second file, leaving to_csv's encoding argument at its default.

df1.to_csv('chatgpt_we.csv', index=False, header=True)