In [64]:
#importing the necessary libs

import pandas as pd
import numpy as np

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [22]:
#read the csv document and view the first 10 rows of the dataframe

date_cols = ['Admission Date',  'Discharge Date', 'Enrolment Date']
df = pd.read_csv(r'Claims Data _RSBY Gujarat cluster 2 Date Formatted.csv', parse_dates=date_cols, infer_datetime_format=True)
df.head(10)

Unnamed: 0,URN,Unique Claim no.,HOF Name,Patient Age,Patient Card Gender,Member ID,Beneficiary District Name,Hospital Name,Hospital Location/district,Admission Date,Discharge Date,Package Name,Claimed Amount,Approved Amount,Status,Enrolment Type,Enrolment Date
0,2.41e+16,GBIC-1,MRS.DAYABHAI,12,1,2.0,Surendranagar,Pt Deendayal Upadhyay Hos,Rajkot,2016-05-24,2016-05-30,General Ward :Unspecified,4500,4500,Paid,Renewal,2016-01-04
1,2.41e+16,GBIC-2,SARBAIBEN,80,2,1.0,Surendranagar,Pt Deendayal Upadhyay Hos,Rajkot,2016-05-24,2016-05-25,General Ward :Unspecified,750,750,Paid,Renewal,2016-01-04
2,2.42e+16,GBIC-3,BHAVINBHAI,32,1,1.0,Anand,Urja Orthopedic Hospital,Vadodara,2016-05-27,2016-06-01,General Ward :Unspecified,3750,3750,Paid,Renewal,2016-01-04
3,2.42e+16,GBIC-4,THAKOR PRATAPBHAI HEMTABHAI,70,1,1.0,Kheda,Sagar Surgical Hospital,Kheda,2016-05-27,2016-06-02,General Ward :Unspecified,4500,0,Denied,Renewal,2016-01-04
4,2.42e+16,GBIC-6,THAKOR PRATAPBHAI HEMTABHAI,18,2,1.0,Kheda,Sagar Surgical Hospital,Kheda,2016-05-27,2016-06-02,General Ward :Unspecified,4500,0,Denied,Renewal,2016-01-04
5,2.42e+16,GBIC-7,MADHABHAI,70,2,1.0,Anand,Advanced Eye Care Clinic,Vadodara,2016-06-03,2016-06-03,Cataract with foldable IOL by Phoco emulsifica...,6000,6000,Paid,Renewal,2016-01-04
6,2.42e+16,GBIC-8,MADUKHAN,30,2,1.0,Kheda,Dhiraj Hospital,Vadodara,2016-05-20,2016-06-03,Spine - Intradural Tumour,15400,15400,Paid,Renewal,2016-01-04
7,2.41e+16,GBIC-9,NATHABHAI,52,2,1.0,Surendranagar,shradhdha surgical hospit,Rajkot,2016-05-30,2016-06-04,Salphingo-oophorectomy,7875,6500,Paid,Renewal,2016-01-04
8,2.41e+16,GBIC-10,NATHABHAI,52,2,1.0,Surendranagar,shradhdha surgical hospit,Rajkot,2016-05-30,2016-06-04,Cystocele - Anterior repair,5750,0,Denied,Renewal,2016-01-04
9,2.41e+16,GBIC-11,NATHABHAI,52,2,1.0,Surendranagar,shradhdha surgical hospit,Rajkot,2016-05-30,2016-06-04,Hysterectomy - abdominal*,11500,11500,Paid,Renewal,2016-01-04


In [23]:
#show all the col type
df.dtypes

URN                                  float64
Unique Claim no.                      object
HOF Name                              object
Patient Age                            int64
Patient Card Gender                    int64
Member ID                            float64
Beneficiary District Name             object
Hospital Name                         object
Hospital Location/district            object
Admission Date                datetime64[ns]
Discharge Date                datetime64[ns]
Package Name                          object
Claimed Amount                         int64
Approved Amount                        int64
Status                                object
Enrolment Type                        object
Enrolment Date                datetime64[ns]
dtype: object

In [24]:
#view the infomation of the dataframe
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10446 entries, 0 to 10445
Data columns (total 17 columns):
URN                           10446 non-null float64
Unique Claim no.              10446 non-null object
HOF Name                      10446 non-null object
Patient Age                   10446 non-null int64
Patient Card Gender           10446 non-null int64
Member ID                     10443 non-null float64
Beneficiary District Name     10446 non-null object
Hospital Name                 10446 non-null object
Hospital Location/district    10446 non-null object
Admission Date                10446 non-null datetime64[ns]
Discharge Date                10446 non-null datetime64[ns]
Package Name                  10445 non-null object
Claimed Amount                10446 non-null int64
Approved Amount               10446 non-null int64
Status                        10446 non-null object
Enrolment Type                10446 non-null object
Enrolment Date                10446 non-null 

In [25]:
#function to replace categorical variables with nominal variable
def recoding(col, codeDict):
  colCoded = pd.Series(col, copy=True)
  for key, value in codeDict.items():
    colCoded.replace(key, value, inplace=True)
  return colCoded

In [42]:
#function to convert all string cells to uppercase
headers = list(df)
str_header = []
for header in headers:
   if df[header].dtype == object:
    str_header.append(header) 
    df[header] = df[header].str.upper()
    df[header] = df[header].str.strip()

In [43]:
#recoding Status from categorical to nominal
df['Status (Coded)'] = recoding(df['Status'], {'PAID':1, 'DENIED': 0})

#checking
print (pd.value_counts(df['Status']))
print (pd.value_counts(df['Status (Coded)']))

PAID      9515
DENIED     931
Name: Status, dtype: int64
1    9515
0     931
Name: Status (Coded), dtype: int64


In [44]:
#recoding Enrolment Type from categorical to nominal
df['Enrolment Type (Coded)'] = recoding(df['Enrolment Type'], {'RENEWAL': 1, 'FRESH': 2})

#checking
print (pd.value_counts(df['Enrolment Type']))
print (pd.value_counts(df['Enrolment Type (Coded)']))

RENEWAL    7088
FRESH      3358
Name: Enrolment Type, dtype: int64
1    7088
2    3358
Name: Enrolment Type (Coded), dtype: int64


In [55]:
#creating a dataframe of only numerical variables
num_df = df.select_dtypes(include=[np.number]).copy().dropna(axis=0)
num_df

Unnamed: 0,URN,Patient Age,Patient Card Gender,Member ID,Claimed Amount,Approved Amount,Status (Coded),Enrolment Type (Coded)
0,2.410000e+16,12,1,2.0,4500,4500,1,1
1,2.410000e+16,80,2,1.0,750,750,1,1
2,2.420000e+16,32,1,1.0,3750,3750,1,1
3,2.420000e+16,70,1,1.0,4500,0,0,1
4,2.420000e+16,18,2,1.0,4500,0,0,1
5,2.420000e+16,70,2,1.0,6000,6000,1,1
6,2.420000e+16,30,2,1.0,15400,15400,1,1
7,2.410000e+16,52,2,1.0,7875,6500,1,1
8,2.410000e+16,52,2,1.0,5750,0,0,1
9,2.410000e+16,52,2,1.0,11500,11500,1,1


In [70]:
num_df.columns.get_loc('Status (Coded)')

6

In [71]:
#splitting the dataframe into the X and Y components
y = num_df.iloc[:, 6]
x = num_df.drop(['URN', 'Status (Coded)'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33)

In [75]:
#running the random hyper parameter optimisation
best_score = 0  
best_params = {'C': None, 'gamma': None}

#for 100 iterations seek best params
for i in range(100):
    #try random values for each hyperparameter
    c_ran = np.random.uniform(0.001, 100)
    gamma_ran = np.random.randint(0.001, 100)
    svc = SVC(C=c_ran, gamma=gamma_ran)
    svc.fit(X_train, y_train)
    score = svc.score(X_test, y_test)

    if score > best_score:
        best_score = score
        best_params['C'] = c_ran
        best_params['gamma'] = gamma_ran

best_score, best_params  

(0.9370467072816943, {'C': 17.407592514188444, 'gamma': 1})