In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [2]:
# Load data
df = pd.read_csv("2021q3_fin_sec_data.csv")

df.drop(columns=['sic'], inplace=True)

# One Hot encode name and cityba
df = pd.concat([df,pd.get_dummies(df['name'])], axis=1).drop(['name'],axis=1)
df = pd.concat([df,pd.get_dummies(df['cityba'])], axis=1).drop(['cityba'],axis=1)

df.head()

Unnamed: 0,afs,Assets,AssetsCurrent,CashAndCashEquivalentsAtCarryingValue,CashCashEquivalentsRestrictedCashAndRestrictedCashEquivalentsPeriodIncreaseDecreaseIncludingExchangeRateEffect,CommonStockParOrStatedValuePerShare,CommonStockValue,IncomeTaxExpenseBenefit,Liabilities,LiabilitiesAndStockholdersEquity,...,yonkers,york,yorktown heights,"yuzhong district, chongqing",zeeland,zephyr cove,zihron ya'akov,zug,zurich,zwijnaarde
0,2,990703000.0,332132000.0,36469000.0,,0.33,8535000.0,3672000.0,,990703000.0,...,0,0,0,0,0,0,0,0,0,0
1,1,12810500000.0,2052600000.0,291800000.0,-131800000.0,0.01,,45400000.0,8469700000.0,12810500000.0,...,0,0,0,0,0,0,0,0,0,0
2,1,55943000000.0,18561000000.0,7759000000.0,136000000.0,0.1,120000000.0,65000000.0,13684000000.0,55943000000.0,...,0,0,0,0,0,0,0,0,0,0
3,4,1104225.0,1100572.0,29194.0,,0.0,77127.0,0.0,6788811.0,1104225.0,...,0,0,0,0,0,0,0,0,0,0
4,4,1131420.0,8081.0,8081.0,,0.0,177714.0,0.0,2632722.0,1131420.0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Extract y-column
y = np.array(df.pop('afs'))

# Convert to numpy
X = np.array(df)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create imputer to replace missing values with the mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)

# Impute data, then train
X_train_imp = imp.transform(X_train)

# Create model
clf = RandomForestClassifier(n_estimators=1000, random_state=0)

# Fit model
clf = clf.fit(X_train_imp, y_train)

# Impute for X_test
X_test_imp = imp.transform(X_test)

# Predict
y_pred2 = clf.predict(X_test_imp)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Precision:", precision_score(y_test, y_pred2, average='macro'))
print("Recall", recall_score(y_test, y_pred2, average='macro'))
print("F1-score", f1_score(y_test, y_pred2, average='micro'))

Accuracy: 0.8477531857813548
Precision: 0.8032784942431004
Recall 0.7202225480118408
F1-score 0.8477531857813547


In [4]:
# Load data
df = pd.read_csv("2021q3_fin_sec_data.csv")

df.drop(columns=['sic'], inplace=True)

# One Hot encode name and cityba
df = pd.concat([df,pd.get_dummies(df['name'])], axis=1).drop(['name'],axis=1)
df = pd.concat([df,pd.get_dummies(df['cityba'])], axis=1).drop(['cityba'],axis=1)

In [5]:
# Extract y-column
y = np.array(df.pop('afs'))

# Convert to numpy
X = np.array(df)

# Standardized data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Create imputer to replace missing values with the mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)

# Impute data, then train
X_train_imp = imp.transform(X_train)

# Create model
clf = RandomForestClassifier(n_estimators=1000, random_state=0)

# Fit model
clf = clf.fit(X_train_imp, y_train)

# Impute for X_test
X_test_imp = imp.transform(X_test)

# Predict
y_pred2 = clf.predict(X_test_imp)

# Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred2))
print("Precision:", precision_score(y_test, y_pred2, average='macro'))
print("Recall", recall_score(y_test, y_pred2, average='macro'))
print("F1-score", f1_score(y_test, y_pred2, average='micro'))

Accuracy: 0.8477531857813548
Precision: 0.7990860990860992
Recall 0.7179664197967747
F1-score 0.8477531857813547
