In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_transformer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
%matplotlib inline

In [2]:
df = pd.read_csv('train_set.csv')

In [3]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,161528,6,a,6,X,2,gynecology,R,F,4.0,45810,2.0,Urgent,Moderate,2,21-30,2817.0,0-10
1,159472,23,a,6,X,4,gynecology,Q,F,2.0,128565,15.0,Trauma,Moderate,4,51-60,4498.0,21-30
2,309765,2,c,5,Z,2,anesthesia,S,F,3.0,46565,5.0,Urgent,Moderate,2,71-80,4573.0,11-20
3,279614,32,f,9,Y,3,gynecology,S,B,4.0,124546,6.0,Emergency,Moderate,4,11-20,7202.0,51-60
4,147791,14,a,1,X,3,gynecology,S,E,2.0,22729,8.0,Urgent,Moderate,2,51-60,3398.0,51-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,237869,12,a,9,Y,3,gynecology,R,B,3.0,82914,3.0,Emergency,Moderate,6,51-60,3966.0,More than 100 Days
99996,254763,28,b,11,X,2,gynecology,R,F,2.0,40026,5.0,Urgent,Moderate,3,21-30,4005.0,51-60
99997,69788,6,a,6,X,3,gynecology,Q,F,3.0,92346,2.0,Trauma,Minor,2,31-40,5215.0,31-40
99998,204442,32,f,9,Y,2,gynecology,S,B,4.0,113798,15.0,Trauma,Moderate,3,41-50,5092.0,11-20


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 18 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   0       100000 non-null  int64  
 1   1       100000 non-null  int64  
 2   2       100000 non-null  object 
 3   3       100000 non-null  int64  
 4   4       100000 non-null  object 
 5   5       100000 non-null  int64  
 6   6       100000 non-null  object 
 7   7       100000 non-null  object 
 8   8       100000 non-null  object 
 9   9       99967 non-null   float64
 10  10      100000 non-null  int64  
 11  11      98517 non-null   float64
 12  12      100000 non-null  object 
 13  13      100000 non-null  object 
 14  14      100000 non-null  int64  
 15  15      100000 non-null  object 
 16  16      100000 non-null  float64
 17  17      100000 non-null  object 
dtypes: float64(3), int64(6), object(9)
memory usage: 13.7+ MB


In [5]:
df.isnull().sum()

0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9       33
10       0
11    1483
12       0
13       0
14       0
15       0
16       0
17       0
dtype: int64

In [6]:
df["9"] = df["9"].fillna(df["9"].mean())                                          #replace NaN values with the mean of their columns
df["11"] = df["11"].fillna(df["11"].mean())

In [7]:
le = LabelEncoder()

In [8]:
cat_col = ['2', '4', '6', '7', '8', '12', '13', '15']         #columns to be encoded
for i in cat_col:
    df[i] = le.fit_transform(df[i])

In [9]:
X = np.array(df[df.columns.difference(['17', '0'])])
y = np.array(df['17'])

In [10]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,161528,6,0,6,0,2,2,2,5,4.0,45810,2.0,2,2,2,2,2817.0,0-10
1,159472,23,0,6,0,4,2,1,5,2.0,128565,15.0,1,2,4,5,4498.0,21-30
2,309765,2,2,5,2,2,1,3,5,3.0,46565,5.0,2,2,2,7,4573.0,11-20
3,279614,32,5,9,1,3,2,3,1,4.0,124546,6.0,0,2,4,1,7202.0,51-60
4,147791,14,0,1,0,3,2,3,4,2.0,22729,8.0,2,2,2,5,3398.0,51-60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,237869,12,0,9,1,3,2,2,1,3.0,82914,3.0,0,2,6,5,3966.0,More than 100 Days
99996,254763,28,1,11,0,2,2,2,5,2.0,40026,5.0,2,2,3,2,4005.0,51-60
99997,69788,6,0,6,0,3,2,1,5,3.0,92346,2.0,1,1,2,3,5215.0,31-40
99998,204442,32,5,9,1,2,2,3,1,4.0,113798,15.0,1,2,3,4,5092.0,11-20


In [11]:
print(X.shape , y.shape)

(100000, 16) (100000,)


In [12]:
df = df[(np.abs(stats.zscore(df.iloc[:,1:17])) < 2.5).all(axis=1)]   #set a top 2,7 for outlayers

In [13]:
def seed_ranker(X, y, seed_range):
    seeds = []
    scores = []
    for seed in range (seed_range):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state=seed)
        lgr = LogisticRegression(n_jobs=-1)
        lgr.fit(X_train, y_train)
        score = lgr.score(X_test,y_test)
        seeds.append(seed)
        scores.append(score)
    scores_df = pd.DataFrame(scores, seeds, columns=['scores'])
    return scores_df.sort_values("scores", ascending=False).head(10)

In [14]:
#seed_ranker(X, y, 250)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.20, random_state=155)

In [16]:
lgr = LogisticRegression()

In [17]:
lgr.fit(X_train, y_train)

LogisticRegression()

In [18]:
lgr.score(X_test,y_test)

0.28185

In [19]:
lgr.fit(X, y)

LogisticRegression()

In [20]:
lgr.score(X, y)


0.27453

In [21]:
to_pred = pd.read_csv("test_set.csv")
to_pred["9"] = to_pred["9"].fillna(to_pred["9"].median())
to_pred["11"] = to_pred["11"].fillna(to_pred["11"].median())

In [22]:
cat_col = ['2', '4', '6', '7', '8', '12', '13', '15']
for i in cat_col:
    to_pred[i] = le.fit_transform(to_pred[i])

In [23]:
X2 = np.array(to_pred[to_pred.columns.difference([ '0'])])

In [24]:
predictions_submit = lgr.predict(np.array(X2))

In [25]:
predictions_submit

array(['21-30', '21-30', '21-30', ..., '21-30', '21-30', '21-30'],
      dtype=object)

In [26]:
to_pred

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,314114,19,0,7,1,4,2,3,2,2.0,59278,8.0,0,1,2,4,4778.0
1,208989,15,2,5,2,3,2,3,5,2.0,102253,15.0,1,2,3,3,5734.0
2,305872,17,4,1,0,4,2,2,4,4.0,5828,4.0,0,1,3,7,5064.0
3,266099,3,2,3,2,4,0,2,0,2.0,56642,9.0,2,0,4,3,3254.0
4,13228,6,0,6,0,4,2,2,5,1.0,116266,8.0,0,1,3,2,4639.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133739,318155,18,3,13,1,6,1,1,1,3.0,123269,25.0,2,1,4,5,5089.0
133740,144850,27,0,7,1,2,2,3,2,2.0,1293,8.0,0,2,3,6,6713.0
133741,180676,16,2,3,2,3,2,2,0,3.0,112921,5.0,1,1,3,3,5326.0
133742,39933,28,1,11,0,4,2,2,5,2.0,585,2.0,1,0,2,3,7072.0


In [27]:
submission = pd.DataFrame({"id": to_pred["0"], "days": predictions_submit})
submission

Unnamed: 0,id,days
0,314114,21-30
1,208989,21-30
2,305872,21-30
3,266099,21-30
4,13228,21-30
...,...,...
133739,318155,21-30
133740,144850,21-30
133741,180676,21-30
133742,39933,21-30


In [28]:
sample = pd.read_csv("sample_submission.csv")

In [29]:
if submission.shape == sample.shape:
    if submission.columns.all() == sample.columns.all():
        if submission.id.all() == sample.id.all():
            print("you're ready to submit!")
            submission.to_csv("to_submit12.csv", index = False)

you're ready to submit!
