In [1]:
import numpy as np
import pandas as pd
import json
import re
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn import metrics

from imblearn.over_sampling import SMOTEN
from sklearn.model_selection import train_test_split 

In [2]:
user_dataset = pd.read_csv(
    './labeled_users.csv',
    sep=',', 
    encoding='latin-1'
    )
with open('./Twitter_User_Handles_labeled_tweets.json') as file:
    dic = json.load(file)

# converting json dataset from dictionary to dataframe
tweets_data = pd.DataFrame.from_dict(dic, orient='index')
tweets_data.reset_index(level=0, inplace=True)
tweets_data['text'] = tweets_data[tweets_data.columns[1:]].apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1
)
tweets_data = pd.DataFrame({'user_id':tweets_data['index'].astype(int), 'text':tweets_data['text']})
tweets_data.head()



Unnamed: 0,user_id,text
0,12488,"YKAR, a futuristic sans serif font by @Emmeran..."
1,719703,"In other words, it’s good news about the vacci..."
2,749003,would it be fair to call lil nas x the first s...
3,822540,@Asmongold ❤️ 🙏 @Hunter4J @Wario64 bonk @FF_XI...
4,865071,@robo_james How about pizza dipped in water 🤦🏻...


In [3]:
def preprocess(data_set):
    t = data_set
    t = re.sub(r'https?://\S+','',t)
    t = re.sub(r"^#\S+|\s#\S+",'',t)
    t = re.sub(r"^@\S+|\s@\S+",'', t)
    t = t.lower()
    return t

In [4]:
tweets_data['text'] = tweets_data['text'].apply(preprocess)
tweets_data.head()

Unnamed: 0,user_id,text
0,12488,"ykar, a futuristic sans serif font by - who c..."
1,719703,"in other words, it’s good news about the vacci..."
2,749003,would it be fair to call lil nas x the first s...
3,822540,❤️ 🙏 bonk nice ed mcboy oos. getting real tir...
4,865071,how about pizza dipped in water 🤦🏻‍♂️ day 21 ...


In [5]:
dataset = pd.merge(user_dataset, tweets_data, on='user_id', how='inner')
dataset = dataset[dataset.race != 5]
dataset.dropna(subset=['race'], inplace=True)


In [6]:
num_data = len(dataset)
per_african = len(dataset[dataset['race']==1.0])/num_data
per_latino = len(dataset[dataset['race']==2.0])/num_data
per_asian = len(dataset[dataset['race']==3.0])/num_data
per_white = len(dataset[dataset['race']==4.0])/num_data

print(f'Number of data points: {num_data}')
print('Percentages')
print(f'African: {per_african}')
print(f'Latino: {per_latino}')
print(f'Asian: {per_asian}')
print(f'White: {per_white}')

Number of data points: 3126
Percentages
African: 0.09564939219449776
Latino: 0.05950095969289827
Asian: 0.03262955854126679
White: 0.8122200895713372


In [7]:
tfidf = TfidfVectorizer(stop_words=None)
tfidf.fit(dataset['text'].values)
X = tfidf.transform(dataset['text'].values)
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=1000, random_state=42)
X = svd.fit_transform(X)
y = dataset['race'].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33,random_state=42)

# from imblearn.under_sampling import RandomUnderSampler
# sampling = {4:300}

# rds = RandomUnderSampler(random_state=42,sampling_strategy=sampling)
# X_SMOTE, y_SMOTE = rds.fit_resample(X_train,y_train)

from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=777,k_neighbors=1)
X_SMOTE, y_SMOTE = smote.fit_resample(X_train, y_train)



In [9]:
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
model = linear_model.LogisticRegression()
model.fit(X_SMOTE, y_SMOTE)
y_pred = model.predict(X_test)
metrics.accuracy_score(
    y_test, 
    y_pred)
    
print(metrics.classification_report(
    y_true=y_test,
    y_pred=y_pred)
    )

              precision    recall  f1-score   support

         1.0       0.45      0.56      0.50       104
         2.0       0.19      0.21      0.20        61
         3.0       0.15      0.13      0.14        38
         4.0       0.88      0.85      0.86       829

    accuracy                           0.76      1032
   macro avg       0.42      0.44      0.43      1032
weighted avg       0.77      0.76      0.76      1032



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
from sklearn.model_selection import KFold, cross_val_score
cv = KFold(n_splits=5, random_state=1, shuffle=True)
# create model
model = linear_model.LogisticRegression()
# evaluate model
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
for i in range(5):
    print(f'Fold {i}: %.3f' % scores[i])
print('Average accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))


Fold 0: 0.804
Fold 1: 0.813
Fold 2: 0.808
Fold 3: 0.826
Fold 4: 0.845
Average accuracy: 0.819 (0.015)
