In [31]:
import pandas as pd

In [32]:
data_1 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML For Good/CRISIS_Adult_April_2020.csv', low_memory=False)
data_2 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML For Good/CRISIS_Adult_April_2021.csv', low_memory=False)
data_3 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML For Good/CRISIS_Adult_May_2020.csv', low_memory=False)
data_4 = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/ML For Good/CRISIS_Adult_November_2020.csv', low_memory=False)

In [33]:
# Intersection to extract features which is common to all 4 dataset
common_features = list(set.intersection(*map(set,[data_1, data_2, data_3, data_4])))

In [34]:
# Concatenating 4 dataset
data = pd.concat([data_1[common_features], data_2[common_features], data_3[common_features], data_4[common_features]])

In [35]:
data.shape

(10364, 68)

In [36]:
# Removing column which is having Null value ration > 0.5
col_applicable = []
for col in data.columns:
    ratio = data[col].isnull().sum() / data.shape[0]
    if ratio < 0.5:
        col_applicable.append(col)

In [37]:
# Filtering Based on Replacable values
data_filtered = data[col_applicable]

In [38]:
# Removing some additional column which may not be necessary
col_to_remove = ['timestamp1', 'country', 'age', 'ID', 'timestamp2']
df_final = data_filtered.drop(col_to_remove, axis = 1)

In [39]:
# Checking NULL Values
df_final.isnull().sum()

exposed___3                  0
hoursofsleepweekends       191
financedifficulty          166
diagnosedfamily___1          0
disruptedsupports___11       0
disruptedsupports___9        0
livingdifficulty           174
impact___4                   0
hoursofsleepweekdays       199
inpersonconvo_bin          269
worriedphysical            151
disruptedsupports___7        0
disruptedsupports___2        0
symptoms___7                 0
familychangestress         180
disruptedsupports___6        0
symptoms___2                 0
positivechange             118
exposed___2                  0
disruptedsupports___4        0
contactschanged            165
disruptedsupports___3        0
disruptedsupports___12       0
exposed___1                  0
difficultycancellations    172
impact___7                   0
disruptedsupports___5        0
diagnosedfamily___3          0
symptoms___3                 0
impact___8                   0
disruptedsupports___1        0
symptoms___6                 0
inperson

In [47]:
# Imputing missing values with Forward Fill Method
for col in df_final.columns:
    if df_final[col].isnull().sum() != 0:
        df_final[col] = df_final[col].fillna(method = 'ffill')

In [49]:
# Encoding Label for age_bin
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(df_final['age_bin'])
df_final['age_bin'] = le.transform(df_final['age_bin'])

In [50]:
df_final.shape

(10364, 60)

In [53]:
# Defining Train & Target Data
X = df_final.drop('worriedmental', axis = 1)
y = df_final['worriedmental']

In [91]:
target_dictionary = {1 : 'Not at all', 2: 'Slightly', 3: 'Moderately', 4: 'Very', 5: 'Extremely'}
target_dictionary.values()

dict_values(['Not at all', 'Slightly', 'Moderately', 'Very', 'Extremely'])

In [63]:
# Encoding Traget data
le_target = LabelEncoder()
le_target.fit(y)
y = le_target.transform(y)

In [58]:
# Encoding Data with Dummie Values
col_to_encode = [col for col in X.columns if '___' not in col]
X_final = pd.get_dummies(X, columns=col_to_encode, drop_first=True)

In [64]:
# Splitting data into train test with ration 80:20
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size = 0.2, random_state = 42, stratify = y)

In [76]:
# Using Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
re = RandomForestClassifier(n_estimators=1000, max_depth=300)
re.fit(X_train, y_train)
re.score(X_test, y_test)

0.48480463096960924

In [92]:
# Macro & Weighted Average
from sklearn.metrics import classification_report
print(classification_report(y_test, re.predict(X_test), target_names=target_dictionary.values()))

              precision    recall  f1-score   support

  Not at all       0.68      0.61      0.64       454
    Slightly       0.44      0.48      0.46       522
  Moderately       0.41      0.45      0.43       526
        Very       0.41      0.38      0.39       356
   Extremely       0.57      0.49      0.53       215

    accuracy                           0.48      2073
   macro avg       0.50      0.48      0.49      2073
weighted avg       0.49      0.48      0.49      2073



In [77]:
# Getting Feature Importance
importance = pd.DataFrame()
importance['Feature_Name'] = re.feature_names_in_
importance['Feature_Value'] = re.feature_importances_
importance.sort_values(by='Feature_Value', ascending=False).iloc[:10, :]

Unnamed: 0,Feature_Name,Feature_Value
53,worriedphysical_3.0,0.018871
52,worriedphysical_2.0,0.016892
55,worriedphysical_5.0,0.016869
117,restrictionsstress_3.0,0.015254
54,worriedphysical_4.0,0.014795
121,age_bin_2,0.013899
126,worriedothers_5.0,0.013849
147,difficultydistancing_2.0,0.013298
139,timeoutside_2.0,0.01283
133,readingtalking_4.0,0.012774
