In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, accuracy_score, classification_report
%matplotlib inline

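This notebook compares several machine learning models (linear regression, a decision tree, and a random forest) on predicting the `Scale` column, and reports each model's score: R² for the regression and accuracy for the two classifiers.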

In [3]:
def split_df_to_x_y(df):
    """Turn the raw frame into a feature matrix ``X`` and a target series ``y``.

    Steps, in order:
      1. Drop the 'District', 'City', 'Date' and 'Time' columns.
      2. Replace each distinct 'Country' value with an integer code, numbered
         in order of first appearance.
      3. Drop every row that still contains a missing value and renumber the
         index from 0.
      4. Split off the 'Scale' column as the target.

    The frame passed in is not modified; all work happens on derived copies.

    Returns a tuple ``(X, y)`` where ``X`` holds every remaining column except
    'Scale' and ``y`` is the 'Scale' column.
    """
    trimmed = df.drop(columns=['District', 'City', 'Date', 'Time'])

    # Codes are assigned before rows with missing values are removed, so the
    # numbering reflects every country seen in the trimmed frame.
    country_codes = {
        name: code
        for code, name in enumerate(trimmed['Country'].unique().tolist())
    }

    encoded = trimmed.replace({'Country': country_codes})
    complete = encoded.dropna().reset_index(drop=True)

    features = complete.drop(columns=['Scale'])
    target = complete['Scale']
    return features, target

In [4]:
def linear_regression_model(X_train,y_train, X_test, y_test):
    """Fit a linear regression on the training split and print its test R² score.

    Parameters:
        X_train, y_train: features and target used to fit the model.
        X_test, y_test: held-out features and target used for scoring.

    Prints a header line followed by the R² of the test predictions.
    Returns nothing.
    """
    regressor = LinearRegression().fit(X_train, y_train)
    test_r2 = r2_score(y_test, regressor.predict(X_test))
    # One call, two lines of output: the header, then the score.
    print('linear regression model:', test_r2, sep='\n')

In [5]:
def random_forest_model(X_train, y_train, X_test, y_test):
    """Fit a random forest classifier and report how it scores on the test split.

    The model is trained and evaluated on exactly the data passed in. The
    function reads no notebook-level variables, so it behaves the same
    regardless of which cells were executed before it.

    Parameters:
        X_train, y_train: features and labels used to fit the forest.
        X_test, y_test: held-out features and labels used for evaluation.

    Prints a header line, the accuracy score, and the per-class
    classification report.

    Returns:
        The labels predicted for ``X_test``.
    """
    rf = RandomForestClassifier(
        criterion='entropy',
        max_depth=18,
        n_estimators=72,
        random_state=31,  # fixed seed so repeated runs build the same forest
        min_samples_leaf=1,
        min_samples_split=9,
        max_features=0.4,
    )
    # Use the caller's split as-is; re-splitting here would discard the
    # arguments and tie the function to global state.
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    print('random forest model:')
    print(accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    return y_pred

In [6]:
def decision_tree_model(X_train, y_train, X_test, y_test):
    """Fit a depth-limited decision tree classifier and print its scores.

    The model is trained and evaluated on exactly the data passed in. The
    function reads no notebook-level variables, so it behaves the same
    regardless of which cells were executed before it.

    Parameters:
        X_train, y_train: features and labels used to fit the tree.
        X_test, y_test: held-out features and labels used for evaluation.

    Prints a header line followed by the test score, the train score, and the
    accuracy of the test predictions. Returns nothing.
    """
    dt = DecisionTreeClassifier(max_depth=5)
    # Use the caller's split as-is; re-splitting here would discard the
    # arguments and tie the function to global state.
    dt.fit(X_train, y_train)
    y_pred = dt.predict(X_test)
    print('decision tree model:')
    # Comparing the train and test scores gives a quick over-fitting check.
    print(f'test: {dt.score(X_test, y_test)}\ntrain: {dt.score(X_train, y_train)}\naccuracy: {accuracy_score(y_test, y_pred)}')

In [7]:
# Load the cleaned dataset and derive the feature matrix and target series.
df = pd.read_csv('cleaned_data.csv')
X, y = split_df_to_x_y(df)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Score the regression first, then the decision tree, on the same split.
for evaluate in (linear_regression_model, decision_tree_model):
    evaluate(X_train, y_train, X_test, y_test)

linear regression model:
0.40175302974096594
decision tree model:
test: 0.6685721386852244
train: 0.672693944760307
accuracy: 0.6685721386852244


In [8]:
# Train the random forest and keep its predictions for the test rows.
y_pred = random_forest_model(X_train, y_train, X_test, y_test)

# Put actual and predicted labels side by side, keeping y_test's index, and
# add a 1/0 flag marking whether each prediction matched.
res_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).assign(
    Correct=lambda frame: (frame['Actual'] == frame['Predicted']).astype(int)
)
res_df

random forest model:
0.7016279358767242
              precision    recall  f1-score   support

        -1.0       0.84      0.43      0.57       123
         0.0       0.80      0.87      0.83      4486
         1.0       0.57      0.59      0.58      2407
         2.0       0.45      0.26      0.33       734
         3.0       0.45      0.29      0.35       225
         4.0       0.71      0.15      0.25        66
         5.0       0.25      0.17      0.20         6

    accuracy                           0.70      8047
   macro avg       0.58      0.39      0.44      8047
weighted avg       0.69      0.70      0.69      8047



Unnamed: 0,Actual,Predicted,Correct
6388,0.0,0.0,1
967,0.0,0.0,1
7615,0.0,0.0,1
39943,2.0,1.0,0
23213,0.0,0.0,1
...,...,...,...
30536,0.0,0.0,1
14854,3.0,1.0,0
23082,0.0,0.0,1
23677,1.0,0.0,0


In [10]:
# Attach the actual/predicted/correct columns to the test features,
# aligning the two frames on their shared row index.
final_df = pd.concat((X_test, res_df), axis='columns')
final_df

Unnamed: 0,Country,Longtitude,Latitude,Length (KM),Width (M),Deaths,Year,Actual,Predicted,Correct
6388,7,-79.3800,34.2000,6.44,9.14,0.0,1996.0,0.0,0.0,1
967,15,-96.8300,37.8300,0.16,9.14,0.0,1990.0,0.0,0.0,1
7615,22,-104.2800,36.7800,0.16,45.72,0.0,1997.0,0.0,0.0,1
39943,16,-92.5544,42.6869,15.69,384.05,0.0,2021.0,2.0,1.0,0
23213,3,-86.4719,31.1399,3.04,45.72,0.0,2008.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...
30536,16,-96.0000,43.2700,0.48,45.72,0.0,2014.0,0.0,0.0,1
14854,14,-86.5800,35.2500,2.57,640.08,0.0,2003.0,3.0,1.0,0
23082,26,-87.3550,40.4991,0.10,9.14,0.0,2008.0,0.0,0.0,1
23677,15,-100.8670,37.8262,6.92,301.75,0.0,2009.0,1.0,0.0,0
