In [1]:
import re
import numpy as np
import pandas as pd
import langdetect
from tqdm import tqdm
from nltk.corpus import stopwords
from langdetect import detect
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xuenichen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/xuenichen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xuenichen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
autism_file_path = 'dataset/Twitter Autism/autism.csv'
control_file_path = 'dataset/Twitter Autism/control_group.csv'

# Load the datasets
autism_df = pd.read_csv(autism_file_path)
control_df = pd.read_csv(control_file_path)

In [5]:
autism_df.head(30)
control_df.head(30)

Unnamed: 0,User_ID,Profile description,Account created,Friends count,Followers count,Tweet date,Tweet id,Language,Tweet text,Hashtags,Location,Reply count,Retweet count,Like count,Source
0,b0b63865cbb84efcd5422e42d13c8672707c82185c2cdf...,üèàü•ã‚úàÔ∏èüåé,2015-02-15 20:32:04+00:00,115,1175,2020-12-31 23:59:58+00:00,1344795463317389312,en,"People come and go, yet they remain irreplacea...",,,0,0,0,"<a href=""http://twitter.com/download/iphone"" r..."
1,978a4548ee471aa84938dd67052799fa586d45fe405517...,Official Dorset Live Weather twitter page. Liv...,2012-04-16 10:29:52+00:00,2955,3053,2016-03-05 23:59:54+00:00,706268029634613248,en,"Wind 1.2 mph WNW. Barometer 1013.4 hPa, Rising...",,"Dorset, England",0,0,0,"<a href=""http://sandaysoft.com/"" rel=""nofollow..."
2,5fbafd883c78ace9e30bccaa0caa350e797c625579aeba...,If its meant to be it will happen no matter wh...,2013-05-30 01:26:14+00:00,207,9938,2014-08-12 23:59:55+00:00,499344560506433536,en,‚òÜ*:ÔΩ°*:*ÔΩ°:*‚òÜ\nluke hemmings &amp; michael cliff...,,,0,1,0,"<a href=""http://www.twitter.com"" rel=""nofollow..."
3,b8554e9a8fcc0eaf7485bfeb3ecc0d64373982d33f1660...,"Welcome to my darkness, I've been here awhile.",2012-01-18 00:46:24+00:00,113,195,2021-03-04 23:59:53+00:00,1367625880030257155,en,I have a mud mask on and my Face ID won‚Äôt work üòÇ,,,0,0,0,"<a href=""http://twitter.com/download/iphone"" r..."
4,979462d73dec55949483575709190ed03c0e9b79bbc7f6...,don't you know who i think i am\n\nestudios de...,2012-09-29 16:53:40+00:00,605,4912,2019-03-30 23:59:35+00:00,1112142359138050048,ht,voy a dormir 3h ****,,icon @hanavbara,0,0,1,"<a href=""http://twitter.com/download/android"" ..."
5,61f04f5b2f369682cfde1f23559a6c553041e96a77bf88...,we're all just a series of compromises. üè≥Ô∏è‚Äçüåà,2009-06-16 22:07:44+00:00,416,103,2016-08-20 23:59:52+00:00,767149183681691648,en,There is an entire break room why do you have ...,,,0,0,0,"<a href=""http://twitter.com/download/iphone"" r..."
6,bc8aef9b4149add8c079114a3a7de91470561b0bb19e1a...,P I T T ‚Äò19 üíõüíô,2011-04-02 20:12:22+00:00,1314,751,2018-05-04 23:59:36+00:00,992554367910924288,en,"Feel like getting cute, wanna go out fr...",,"Pittsburgh, PA",0,0,0,"<a href=""http://twitter.com/download/iphone"" r..."
7,2c43fd585f819540b582c541722573bcd8ee851a8085bd...,"Win or lose, hit the booz",2011-12-11 04:43:15+00:00,107,72,2014-03-19 23:59:58+00:00,446435946339512321,en,Whats that movie with David Bowie as the bad guy?,,Delta,0,0,0,"<a href=""http://twitter.com/download/iphone"" r..."
8,919780aae89e0c104ec30b4f50a1d2799b5a2a72677065...,billlie world ambassador,2020-11-12 18:37:10+00:00,239,185,2021-03-19 23:59:53+00:00,1373061696558919683,en,senku and tsukasa... this is definitely fruity,,21 she/they,1,0,0,"<a href=""http://twitter.com/download/iphone"" r..."
9,fcfe5e62a660430a90e5fc001d34f3c7b507656cacbf94...,kwf üíç mommy to LJ ü§ç lola and dolph üê∂,2013-06-26 07:39:55+00:00,826,2245,2018-03-30 23:59:50+00:00,979870851922350080,en,Fixing to meet Dolph and I have no clue what I...,,one direction fan account,0,0,15,"<a href=""http://twitter.com/download/iphone"" r..."


In [4]:
# Check for missing values in all columns
missing_values = autism_df.isnull().sum()
print("Missing values per column:\n", missing_values)

total_rows = autism_df.shape[0]
print("Total number of rows:", total_rows)

Missing values per column:
 User_ID                      0
Profile description         33
Account created              0
Friends count                0
Followers count              0
Tweet date                   0
Tweet id                     0
Language                     0
Tweet text                   1
Hashtags               2161360
Location                552315
Reply count                  0
Retweet count                0
Like count                   0
Source                       0
dtype: int64
Total number of rows: 3137952


In [5]:
# Check for missing values in all columns
missing_values = control_df.isnull().sum()
print("Missing values per column:\n", missing_values)

total_rows = control_df.shape[0]
print("Total number of rows:", total_rows)

Missing values per column:
 User_ID                      0
Profile description     477327
Account created              0
Friends count                0
Followers count              0
Tweet date                   0
Tweet id                     0
Language                     0
Tweet text                  14
Hashtags               3020503
Location               1073027
Reply count                  0
Retweet count                0
Like count                   0
Source                       0
dtype: int64
Total number of rows: 3377518


# drop some columns and rows

In [6]:
# Drop rows where 'User_ID' is NaN in autism_df
autism_df = autism_df.dropna(subset=['User_ID'])

# Drop rows where 'User_ID' is NaN in control_df
control_df = control_df.dropna(subset=['User_ID'])

In [7]:
control_df = control_df.drop(columns=['Source', 'Account created',])
autism_df = autism_df.drop(columns=['Source', 'Account created',])
autism_df.head()
control_df.head()

Unnamed: 0,User_ID,Profile description,Friends count,Followers count,Tweet date,Tweet id,Language,Tweet text,Hashtags,Location,Reply count,Retweet count,Like count
0,b0b63865cbb84efcd5422e42d13c8672707c82185c2cdf...,üèàü•ã‚úàÔ∏èüåé,115,1175,2020-12-31 23:59:58+00:00,1344795463317389312,en,"People come and go, yet they remain irreplacea...",,,0,0,0
1,978a4548ee471aa84938dd67052799fa586d45fe405517...,Official Dorset Live Weather twitter page. Liv...,2955,3053,2016-03-05 23:59:54+00:00,706268029634613248,en,"Wind 1.2 mph WNW. Barometer 1013.4 hPa, Rising...",,"Dorset, England",0,0,0
2,5fbafd883c78ace9e30bccaa0caa350e797c625579aeba...,If its meant to be it will happen no matter wh...,207,9938,2014-08-12 23:59:55+00:00,499344560506433536,en,‚òÜ*:ÔΩ°*:*ÔΩ°:*‚òÜ\nluke hemmings &amp; michael cliff...,,,0,1,0
3,b8554e9a8fcc0eaf7485bfeb3ecc0d64373982d33f1660...,"Welcome to my darkness, I've been here awhile.",113,195,2021-03-04 23:59:53+00:00,1367625880030257155,en,I have a mud mask on and my Face ID won‚Äôt work üòÇ,,,0,0,0
4,979462d73dec55949483575709190ed03c0e9b79bbc7f6...,don't you know who i think i am\n\nestudios de...,605,4912,2019-03-30 23:59:35+00:00,1112142359138050048,ht,voy a dormir 3h ****,,icon @hanavbara,0,0,1


## drop rows with no value for 'reply count'& 'retweet cont'
## drop rows with no value for 'friend count'& 'follower cont'

In [8]:
# Filter out rows where 'Reply count', 'Retweet count' are all 0
control_df = control_df[((control_df['Reply count'].notna() & control_df['Reply count'] != 0) |
                         (control_df['Retweet count'].notna() & control_df['Retweet count'] != 0))]

# Filter out rows where 'Friends count', 'Followers count' are both 0
control_df = control_df[((control_df['Friends count'].notna() & control_df['Friends count'] != 0) |
                         (control_df['Followers count'].notna() & control_df['Followers count'] != 0))]

In [9]:
# Filter out rows where 'Reply count', 'Retweet count' are all 0
autism_df = autism_df[((autism_df['Reply count'].notna() & autism_df['Reply count'] != 0) |
                       (autism_df['Retweet count'].notna() & autism_df['Retweet count'] != 0))]

# Filter out rows where 'Friends count', 'Followers count' are both 0
autism_df = autism_df[((autism_df['Friends count'].notna() & autism_df['Friends count'] != 0) |
                       (autism_df['Followers count'].notna() & autism_df['Followers count'] != 0))]

In [10]:
# Remove duplicates based on 'Tweet text'
autism_df = autism_df.drop_duplicates(subset=['Tweet text'])
control_df = control_df.drop_duplicates(subset=['Tweet text'])

In [11]:
print("New total number of rows:", autism_df.shape[0])
print("New total number of rows:", control_df.shape[0])

New total number of rows: 784178
New total number of rows: 525815


## drop rows that has no 'Hashtags'

In [12]:
# Drop rows that have missing values in the 'Hashtags' column
control_df = control_df.dropna(subset=['Hashtags']).copy()
autism_df = autism_df.dropna(subset=['Hashtags']).copy()

In [13]:
print("New total number of rows:", control_df.shape[0])
print("New total number of rows:", autism_df.shape[0])

New total number of rows: 60439
New total number of rows: 265817


## drop rows for non-English

In [14]:
control_df_ratio = control_df['Language'].value_counts(normalize=True)
print("Language ratio before dropping non-English rows:\n", control_df_ratio)

Language ratio before dropping non-English rows:
 Language
en     0.990271
es     0.002813
tl     0.002383
fr     0.001241
pt     0.000811
in     0.000529
nl     0.000414
de     0.000265
eu     0.000232
it     0.000199
sv     0.000149
ht     0.000149
tr     0.000116
et     0.000083
ca     0.000083
da     0.000083
no     0.000066
cy     0.000033
pl     0.000033
lt     0.000017
vi     0.000017
und    0.000017
Name: proportion, dtype: float64


In [15]:
autism_df_ratio = autism_df['Language'].value_counts(normalize=True)
print("Language ratio before dropping non-English rows:\n", autism_df_ratio)

Language ratio before dropping non-English rows:
 Language
en    0.998123
fr    0.000974
de    0.000199
nl    0.000139
tl    0.000109
es    0.000075
ht    0.000056
et    0.000053
tr    0.000038
pt    0.000038
in    0.000038
no    0.000030
it    0.000026
da    0.000023
sv    0.000019
ca    0.000015
vi    0.000011
hi    0.000008
lv    0.000008
eu    0.000008
cy    0.000004
pl    0.000004
is    0.000004
Name: proportion, dtype: float64


In [16]:
control_df = control_df[control_df['Language'] == 'en']
autism_df = autism_df[autism_df['Language'] == 'en']

In [17]:
control_df_ratio = control_df['Language'].value_counts(normalize=True)
print("Language ratio after dropping non-English rows:\n", control_df_ratio)

Language ratio after dropping non-English rows:
 Language
en    1.0
Name: proportion, dtype: float64


In [18]:
autism_df_ratio = autism_df['Language'].value_counts(normalize=True)
print("Language ratio after dropping non-English rows:\n", autism_df_ratio)

Language ratio after dropping non-English rows:
 Language
en    1.0
Name: proportion, dtype: float64


In [19]:
# Check for missing values in all columns
missing_values = control_df.isnull().sum()
print("Missing values per column:\n", missing_values)

total_rows = control_df.shape[0]
print("Total number of rows:", total_rows)

Missing values per column:
 User_ID                    0
Profile description     5221
Friends count              0
Followers count            0
Tweet date                 0
Tweet id                   0
Language                   0
Tweet text                 0
Hashtags                   0
Location               15257
Reply count                0
Retweet count              0
Like count                 0
dtype: int64
Total number of rows: 59851


In [20]:
# Check for missing values in all columns
missing_values = autism_df.isnull().sum()
print("Missing values per column:\n", missing_values)

total_rows = autism_df.shape[0]
print("Total number of rows:", total_rows)

Missing values per column:
 User_ID                    0
Profile description        0
Friends count              0
Followers count            0
Tweet date                 0
Tweet id                   0
Language                   0
Tweet text                 0
Hashtags                   0
Location               47242
Reply count                0
Retweet count              0
Like count                 0
dtype: int64
Total number of rows: 265318


In [21]:
print("New total number of rows:", control_df.shape[0])
print("New total number of rows:", autism_df.shape[0])

New total number of rows: 59851
New total number of rows: 265318


# clean the tweet texts

In [22]:
# Function to clean tweet text
def clean_tweet_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove HTML entities
    text = re.sub(r'&[a-z]+;', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Rejoin words
    clean_text = ' '.join(lemmatized_text)
    return clean_text

In [23]:
# Apply the cleaning function to the 'Tweet text' column
control_df['Tweet text'] = control_df['Tweet text'].apply(clean_tweet_text)

autism_df['Tweet text'] = autism_df['Tweet text'].apply(clean_tweet_text)

# drop meangingless columns

In [24]:
control_df = control_df.drop(columns=['Language'])
autism_df = autism_df.drop(columns=['Language'])

# add sentiment labels

In [25]:
# Initialize the pipeline
tokenizer = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment")
sentiment_pipeline = pipeline(
    "sentiment-analysis", model=model, tokenizer=tokenizer)


# Function to convert sentiment analysis output to continuous scale, with rounding
def get_continuous_sentiment(text, pipeline):
    result = pipeline(text)[0]
    # The output is a dictionary with 'label' and 'score'.
    # Labels correspond to 'LABEL_0' (negative), 'LABEL_1' (neutral), 'LABEL_2' (positive)
    label = result['label']
    if label == 'LABEL_2':  # Positive
        return 2
    elif label == 'LABEL_0':  # Negative
        return -2
    else:  # Neutral
        return 0

In [26]:
def safe_get_continuous_sentiment(text, sentiment_pipeline):
    # Check if text is a non-empty string and not NaN
    if pd.notnull(text) and text.strip():
        return get_continuous_sentiment(text, sentiment_pipeline)
    else:
        return 0  # Default sentiment value for invalid inputs

In [27]:
tqdm.pandas()
control_df['Tweet Text Sentiment'] = control_df['Tweet text'].progress_apply(
    lambda x: safe_get_continuous_sentiment(x, sentiment_pipeline))
autism_df['Tweet Text Sentiment'] = autism_df['Tweet text'].progress_apply(
    lambda x: safe_get_continuous_sentiment(x, sentiment_pipeline))

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 59851/59851 [42:49<00:00, 23.29it/s] 
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 265318/265318 [3:19:06<00:00, 22.21it/s]  


In [28]:
# Save the modified DataFrame
control_df.to_csv(
    'dataset/control_tweets_with_sentiment.csv', index=False)

print("Sentiment analysis with continuous scaling completed.")


autism_df.to_csv(
    'dataset/austim_tweets_with_sentiment.csv', index=False)

print("Sentiment analysis with continuous scaling completed.")

Sentiment analysis with continuous scaling completed.
Sentiment analysis with continuous scaling completed.
