In [1]:
import re
import numpy as np
import pandas as pd
import langdetect
from tqdm import tqdm
from nltk.corpus import stopwords
from langdetect import detect
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xuenichen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/xuenichen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/xuenichen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
autism_file_path = '/Users/xuenichen/Desktop/BEF_Chen/dataset/Twitter Autism/autism.csv'
control_file_path = '/Users/xuenichen/Desktop/BEF_Chen/dataset/Twitter Autism/control_group.csv'

# Load the datasets
autism_df = pd.read_csv(autism_file_path)
control_df = pd.read_csv(control_file_path)

In [4]:
print("New total number of rows:", autism_df.shape[0])
print("New total number of rows:", control_df.shape[0])

New total number of rows: 3137952
New total number of rows: 3377518


In [5]:
# Drop rows where 'User_ID' is NaN in autism_df
autism_df = autism_df.dropna(subset=['User_ID'])

# Drop rows where 'User_ID' is NaN in control_df
control_df = control_df.dropna(subset=['User_ID'])

In [6]:
control_df = control_df.drop(
    columns=['Source', 'Account created', 'Profile description'])
autism_df = autism_df.drop(
    columns=['Source', 'Account created', 'Profile description'])
autism_df.head()
control_df.head()

Unnamed: 0,User_ID,Friends count,Followers count,Tweet date,Tweet id,Language,Tweet text,Hashtags,Location,Reply count,Retweet count,Like count
0,b0b63865cbb84efcd5422e42d13c8672707c82185c2cdf...,115,1175,2020-12-31 23:59:58+00:00,1344795463317389312,en,"People come and go, yet they remain irreplacea...",,,0,0,0
1,978a4548ee471aa84938dd67052799fa586d45fe405517...,2955,3053,2016-03-05 23:59:54+00:00,706268029634613248,en,"Wind 1.2 mph WNW. Barometer 1013.4 hPa, Rising...",,"Dorset, England",0,0,0
2,5fbafd883c78ace9e30bccaa0caa350e797c625579aeba...,207,9938,2014-08-12 23:59:55+00:00,499344560506433536,en,☆*:｡*:*｡:*☆\nluke hemmings &amp; michael cliff...,,,0,1,0
3,b8554e9a8fcc0eaf7485bfeb3ecc0d64373982d33f1660...,113,195,2021-03-04 23:59:53+00:00,1367625880030257155,en,I have a mud mask on and my Face ID won’t work 😂,,,0,0,0
4,979462d73dec55949483575709190ed03c0e9b79bbc7f6...,605,4912,2019-03-30 23:59:35+00:00,1112142359138050048,ht,voy a dormir 3h ****,,icon @hanavbara,0,0,1


In [7]:
print("New total number of rows:", autism_df.shape[0])
print("New total number of rows:", control_df.shape[0])

New total number of rows: 3137952
New total number of rows: 3377518


In [8]:
# Drop rows that have missing values in the 'Hashtags' column
control_df = control_df.dropna(subset=['Hashtags']).copy()
autism_df = autism_df.dropna(subset=['Hashtags']).copy()

In [9]:
print("New total number of rows:", control_df.shape[0])
print("New total number of rows:", autism_df.shape[0])

New total number of rows: 357015
New total number of rows: 976592


In [10]:
control_df_ratio = control_df['Language'].value_counts(normalize=True)
print("Language ratio before dropping non-English rows:\n", control_df_ratio)

Language ratio before dropping non-English rows:
 Language
en     0.989040
es     0.002944
tl     0.002101
fr     0.001605
pt     0.001090
in     0.000692
nl     0.000431
ht     0.000398
it     0.000286
de     0.000238
et     0.000207
vi     0.000151
sv     0.000115
no     0.000112
da     0.000109
pl     0.000098
tr     0.000095
eu     0.000087
cy     0.000048
ca     0.000045
qme    0.000034
lv     0.000025
und    0.000011
hi     0.000008
lt     0.000008
hu     0.000008
is     0.000006
cs     0.000006
ja     0.000003
Name: proportion, dtype: float64


In [11]:
autism_df_ratio = autism_df['Language'].value_counts(normalize=True)
print("Language ratio before dropping non-English rows:\n", autism_df_ratio)

Language ratio before dropping non-English rows:
 Language
en     0.998064
fr     0.000620
de     0.000191
nl     0.000186
tl     0.000167
es     0.000152
ht     0.000081
in     0.000073
pt     0.000072
et     0.000069
da     0.000065
no     0.000059
sv     0.000034
it     0.000034
tr     0.000032
ca     0.000018
cy     0.000018
vi     0.000013
pl     0.000010
lv     0.000010
eu     0.000010
is     0.000007
cs     0.000005
und    0.000003
hi     0.000002
lt     0.000002
qme    0.000002
hu     0.000001
Name: proportion, dtype: float64


In [12]:
control_df = control_df[control_df['Language'] == 'en']
autism_df = autism_df[autism_df['Language'] == 'en']

In [13]:
print("New total number of rows:", autism_df.shape[0])
print("New total number of rows:", control_df.shape[0])

New total number of rows: 974701
New total number of rows: 353102


# clean the tweet texts

In [14]:
# Function to clean tweet text
def clean_tweet_text(text):
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    text = re.sub(r'\@\w+|\#', '', text)
    # Remove HTML entities
    text = re.sub(r'&[a-z]+;', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_text = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    # Rejoin words
    clean_text = ' '.join(lemmatized_text)
    return clean_text

In [15]:
# Apply the cleaning function to the 'Tweet text' column
control_df['Tweet text'] = control_df['Tweet text'].apply(clean_tweet_text)

autism_df['Tweet text'] = autism_df['Tweet text'].apply(clean_tweet_text)

In [16]:
control_df = control_df.drop(columns=['Language'])
autism_df = autism_df.drop(columns=['Language'])

In [17]:
# Initialize the pipeline
tokenizer = AutoTokenizer.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment")
model = AutoModelForSequenceClassification.from_pretrained(
    "cardiffnlp/twitter-roberta-base-sentiment")
sentiment_pipeline = pipeline(
    "sentiment-analysis", model=model, tokenizer=tokenizer)


# Function to convert sentiment analysis output to continuous scale, with rounding
def get_continuous_sentiment(text, pipeline):
    result = pipeline(text)[0]
    # The output is a dictionary with 'label' and 'score'.
    # Labels correspond to 'LABEL_0' (negative), 'LABEL_1' (neutral), 'LABEL_2' (positive)
    label = result['label']
    if label == 'LABEL_2':  # Positive
        return 2
    elif label == 'LABEL_0':  # Negative
        return -2
    else:  # Neutral
        return 0

In [18]:
def safe_get_continuous_sentiment(text, sentiment_pipeline):
    # Check if text is a non-empty string and not NaN
    if pd.notnull(text) and text.strip():
        return get_continuous_sentiment(text, sentiment_pipeline)
    else:
        return 0  # Default sentiment value for invalid inputs

In [19]:
tqdm.pandas()
control_df['Tweet Text Sentiment'] = control_df['Tweet text'].progress_apply(
    lambda x: safe_get_continuous_sentiment(x, sentiment_pipeline))
autism_df['Tweet Text Sentiment'] = autism_df['Tweet text'].progress_apply(
    lambda x: safe_get_continuous_sentiment(x, sentiment_pipeline))

100%|██████████| 353102/353102 [4:16:12<00:00, 22.97it/s]  
100%|██████████| 974701/974701 [12:03:12<00:00, 22.46it/s]   


In [20]:
# Save the modified DataFrame
control_df.to_csv(
    '/Users/xuenichen/Desktop/BEF_Chen/dataset/control_tweets_with_sentiment-all.csv', index=False)

print("Sentiment analysis with continuous scaling completed.")


autism_df.to_csv(
    '/Users/xuenichen/Desktop/BEF_Chen/dataset/austim_tweets_with_sentiment-all.csv', index=False)

print("Sentiment analysis with continuous scaling completed.")

Sentiment analysis with continuous scaling completed.
Sentiment analysis with continuous scaling completed.


# save the files with dropping rows with unique tweets, as no intractions

In [21]:
tweet_id_frequencies = control_df['Tweet id'].value_counts()

# Step 2: Filter to keep only rows with 'Tweet id' that appear more than once
control_df_inter = control_df[control_df['Tweet id'].map(tweet_id_frequencies) > 1]

tweet_id_frequencies = autism_df['Tweet id'].value_counts()

# Step 2: Filter to keep only rows with 'Tweet id' that appear more than once
autism_df_inter = autism_df[autism_df['Tweet id'].map(tweet_id_frequencies) > 1]

In [22]:
print("New total number of rows:", autism_df_inter.shape[0])
print("New total number of rows:", control_df_inter.shape[0])

New total number of rows: 30
New total number of rows: 1978


In [23]:
# Save the modified DataFrame
control_df_inter.to_csv(
    '/Users/xuenichen/Desktop/BEF_Chen/dataset/control_tweets_with_sentiment-inter.csv', index=False)

print("Sentiment analysis with continuous scaling completed.")


autism_df_inter.to_csv(
    '/Users/xuenichen/Desktop/BEF_Chen/dataset/austim_tweets_with_sentiment-inter.csv', index=False)

print("Sentiment analysis with continuous scaling completed.")

Sentiment analysis with continuous scaling completed.
Sentiment analysis with continuous scaling completed.


# save files with dropping rows with no value for 'reply count'& 'retweet cont'+  drop rows with no value for 'friend count'& 'follower cont'

In [24]:
# Filter out rows where 'Reply count', 'Retweet count' are all 0
control_df_act = control_df[((control_df['Reply count'].notna() & control_df['Reply count'] != 0) |
                         (control_df['Retweet count'].notna() & control_df['Retweet count'] != 0))]

# Filter out rows where 'Friends count', 'Followers count' are both 0
control_df_act = control_df[((control_df['Friends count'].notna() & control_df['Friends count'] != 0) |
                         (control_df['Followers count'].notna() & control_df['Followers count'] != 0))]

In [25]:
# Filter out rows where 'Reply count', 'Retweet count' are all 0
autism_df_act = autism_df[((autism_df['Reply count'].notna() & autism_df['Reply count'] != 0) |
                       (autism_df['Retweet count'].notna() & autism_df['Retweet count'] != 0))]

# Filter out rows where 'Friends count', 'Followers count' are both 0
autism_df_act = autism_df[((autism_df['Friends count'].notna() & autism_df['Friends count'] != 0) |
                       (autism_df['Followers count'].notna() & autism_df['Followers count'] != 0))]

In [26]:
print("New total number of rows:", autism_df_act.shape[0])
print("New total number of rows:", control_df_act.shape[0])

New total number of rows: 724418
New total number of rows: 267897


In [None]:
# Remove duplicates based on 'Tweet text'
autism_df_act = autism_df_act.drop_duplicates(subset=['Tweet text'])
control_df_act = control_df_act.drop_duplicates(subset=['Tweet text'])