# In this project we will be working on the dataset for Doctor's Consultation Fee to analyze the data and create a model to predict the fees. 
The  focus will be on transforming the data to be able to work with it and create a working model.

In [1]:
#Importing the Libraries

import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier,GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

In [2]:
#read the csv files

df=pd.read_csv("Final_Train.csv")
test=pd.read_csv("Final_Test.csv")

Lets check the details on data to get some insights.

In [3]:
df.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile,Miscellaneous_Info,Fees
0,"BHMS, MD - Homeopathy",24 years experience,100%,"Kakkanad, Ernakulam",Homeopath,"100% 16 Feedback Kakkanad, Ernakulam",100
1,"BAMS, MD - Ayurveda Medicine",12 years experience,98%,"Whitefield, Bangalore",Ayurveda,"98% 76 Feedback Whitefield, Bangalore",350
2,"MBBS, MS - Otorhinolaryngology",9 years experience,,"Mathikere - BEL, Bangalore",ENT Specialist,,300
3,"BSc - Zoology, BAMS",12 years experience,,"Bannerghatta Road, Bangalore",Ayurveda,"Bannerghatta Road, Bangalore ₹250 Available on...",250
4,BAMS,20 years experience,100%,"Keelkattalai, Chennai",Ayurveda,"100% 4 Feedback Keelkattalai, Chennai",250


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5960 entries, 0 to 5959
Data columns (total 7 columns):
Qualification         5960 non-null object
Experience            5960 non-null object
Rating                2659 non-null object
Place                 5935 non-null object
Profile               5960 non-null object
Miscellaneous_Info    3341 non-null object
Fees                  5960 non-null int64
dtypes: int64(1), object(6)
memory usage: 326.0+ KB


In [5]:
df.describe()

Unnamed: 0,Fees
count,5960.0
mean,307.954362
std,190.934916
min,5.0
25%,150.0
50%,300.0
75%,500.0
max,950.0


As wee can see there are 6 object data type columns and one integer. We need to convert this data. 

Lets combine the test and train data to be used at a later stage for label encoding to avoid errors when working on model.

While looking at the data, we will also be working on data cleaning for both train and test dataset as applicable and required.

In [6]:
data = [df[['Qualification', 'Experience', 'Rating', 'Place', 'Profile', 'Miscellaneous_Info']], test]

data = pd.concat(data)


In [7]:
data.isnull().sum()

Qualification            0
Experience               0
Rating                4391
Place                   31
Profile                  0
Miscellaneous_Info    3453
dtype: int64

In [8]:
test.isnull().sum()

Qualification            0
Experience               0
Rating                1090
Place                    6
Profile                  0
Miscellaneous_Info     834
dtype: int64

There are null values in 3 columns and we need to work on them while cleaning the data.

Lets work on the Qualification column to find the maximum number of qualifications a person has and then split the qualification column into maximum number of qualifications a person has.

In [9]:

Qualification = list(data['Qualification'])

# Finding Maximum number of Qualification mentioned in a single cell
maxim = 1
for i in Qualification :
    if len(i.split(',')) > maxim:
         maxim = len(i.split(','))
         
print("\n\nMaximum Qualificaton in a Cell : ", maxim)



Maximum Qualificaton in a Cell :  17


In [10]:
all_qualification = []

for i in Qualification :
    if len(i.split(',')) == 1:
         all_qualification.append(i.split(',')[0].strip().upper())
    else :
        for it in range(len(i.split(','))):
            all_qualification.append(i.split(',')[it].strip().upper())

print("\n\nNumber of Unique Qualifications : ", len(pd.Series(all_qualification).unique()))
print("\n\nUnique Qualifications:\n", pd.Series(all_qualification).unique())

all_qualification = list(pd.Series(all_qualification).unique())



Number of Unique Qualifications :  898


Unique Qualifications:
 ['BHMS' 'MD - HOMEOPATHY' 'BAMS' 'MD - AYURVEDA MEDICINE' 'MBBS'
 'MS - OTORHINOLARYNGOLOGY' 'BSC - ZOOLOGY' 'BDS' 'MD - GENERAL MEDICINE'
 'BSC' 'MS' 'DNB - ENT' 'MDS' 'MDS - ORAL & MAXILLOFACIAL SURGERY'
 'DIPLOMA IN OTORHINOLARYNGOLOGY (DLO)' 'MF- HOMEOPATHY' 'MS - ENT' 'DNB'
 'FELLOWSHIP OF COLLEGE OF GENERAL PRACTICE (FCGP)' 'DDVL' 'IBCLC (USA)'
 'POST GRADUATE DIPLOMA IN DIABETOLOGY (PGDD)' 'DIPLOMA IN DERMATOLOGY'
 'FELLOWSHIP IN AESTHETIC MEDICINE' 'DM - NEUROLOGY' 'MD - PEDIATRICS'
 'FELLOWSHIP IN CLEFT LIP AND CRANIOFACIAL SURGERY'
 'DIPLOMA IN COSMETOLOGY' 'AFIH' 'MDS - PROSTHODONTIST AND CROWN BRIDGE'
 'M. D. HOM. (PRACTICE OF MEDICINE)' 'DIPLOMA IN EMERGENCY MEDICINE'
 'DIPLOMA IN COUNSELLING SKILLS'
 'DHMS (DIPLOMA IN HOMEOPATHIC MEDICINE AND SURGERY)' 'MD - DERMATOLOGY'
 'VENEREOLOGY & LEPROSY' 'MDS - ORAL AND MAXILLOFACIAL PATHOLOGY'
 'MFDS RCS' 'MRCS (UK)' 'FRGUHS'
 'FELLOWSHIP IN DERMATOLOGICAL LASER S

In [11]:
Qualification = list(df['Qualification'])

Q1 = []
Q2 = []
Q3 = []
Q4 = []
Q5 = []
Q6 = []
Q7 = []
Q8 = []
Q9 = []
Q10 = []
Q11 = []
Q12 = []
Q13 = []
Q14 = []
Q15 = []
Q16 = []
Q17 = []


for i in Qualification:
        try :
            Q1.append(i.split(',')[0].strip().upper())
        except :
            Q1.append('NONE')
        try :
            Q2.append(i.split(',')[1].strip().upper())
        except :
            Q2.append('NONE')
        try :
            Q3.append(i.split(',')[2].strip().upper())
        except :
            Q3.append('NONE')
        try :
            Q4.append(i.split(',')[3].strip().upper())
        except :
            Q4.append('NONE')
        try :
            Q5.append(i.split(',')[4].strip().upper())
        except :
            Q5.append('NONE')
        try :
            Q6.append(i.split(',')[5].strip().upper())
        except :
            Q6.append('NONE')
        try :
            Q7.append(i.split(',')[6].strip().upper())
        except :
            Q7.append('NONE')
        try :
            Q8.append(i.split(',')[7].strip().upper())
        except :
            Q8.append('NONE')
        try :
            Q9.append(i.split(',')[8].strip().upper())
        except :
            Q9.append('NONE')    
        try :
            Q10.append(i.split(',')[9].strip().upper())
        except :
            Q10.append('NONE')
        try :
            Q11.append(i.split(',')[10].strip().upper())
        except :
            Q11.append('NONE')
        try :
            Q12.append(i.split(',')[11].strip().upper())
        except :
            Q12.append('NONE')
        try :
            Q13.append(i.split(',')[12].strip().upper())
        except :
            Q13.append('NONE')
        try :
            Q14.append(i.split(',')[13].strip().upper())
        except :
            Q14.append('NONE')
        try :
            Q15.append(i.split(',')[14].strip().upper())
        except :
            Q15.append('NONE')
        try :
            Q16.append(i.split(',')[15].strip().upper())
        except :
            Q16.append('NONE')
        try :
            Q17.append(i.split(',')[16].strip().upper())
        except :
            Q17.append('NONE')


In [12]:
# appending NONE to Unique qualification list
all_qualification.append('NONE')

Lets work on the Places column. This column has two pieces of information - locality and city. Lets split the Places column into two columns - Locality, City.

In [13]:
all_places = list(data['Place'])

for i in range(len(all_places)):
    if type(all_places[i]) == float:
        all_places[i] = 'NOT AVAILABLE'
    all_places[i] = all_places[i].strip().upper()
    
maxim = 1
for i in all_places :
    if len(i.split(',')) > maxim:
         maxim = len(i.split(','))
         
print("\n\nMaximum places in a Cell : ", maxim) 
print("\n\nNumber of Unique places (Including NOT AVAILABLE): ", len(pd.Series(all_places).unique()))
print("\n\nUnique Places:\n", pd.Series(all_places).unique())
 
all_places = list(pd.Series(all_places).unique())




Maximum places in a Cell :  2


Number of Unique places (Including NOT AVAILABLE):  948


Unique Places:
 ['KAKKANAD, ERNAKULAM' 'WHITEFIELD, BANGALORE'
 'MATHIKERE - BEL, BANGALORE' 'BANNERGHATTA ROAD, BANGALORE'
 'KEELKATTALAI, CHENNAI' 'PORUR, CHENNAI' 'KAROL BAGH, DELHI'
 'AREKERE, BANGALORE' 'OLD CITY, HYDERABAD' 'ATHANI, ERNAKULAM'
 'THOUSAND LIGHTS, CHENNAI' 'SOMAJIGUDA, HYDERABAD'
 'COIMBATORE RACECOURSE, COIMBATORE' 'JUBILEE HILLS, HYDERABAD'
 'KONDLI, DELHI' 'SAROOR NAGAR, HYDERABAD' 'TAMBARAM WEST, CHENNAI'
 'PURASAWAKKAM, CHENNAI' 'KPHB, HYDERABAD' 'HSR LAYOUT, BANGALORE'
 'POLLACHI, COIMBATORE' 'VASUNDHRA ENCLAVE, DELHI' 'CHEMBUR, MUMBAI'
 'THAMMANAM, ERNAKULAM' 'ANDHERI, MUMBAI' 'PATTOM, THIRUVANANTHAPURAM'
 'KUKATPALLY, HYDERABAD' 'VADAVALLI, COIMBATORE' 'DEFENCE COLONY, DELHI'
 'BANJARA HILLS, HYDERABAD' 'SION WEST, MUMBAI' 'CR PARK, DELHI'
 'MOGAPPAIR EAST, CHENNAI' 'IP EXTENSION, DELHI'
 'SAFDARJUNG ENCLAVE, DELHI' 'NEW FRIENDS COLONY, DELHI'
 'BORIVALI WEST, MUMBAI

In [14]:
places= list(df['Place'])


Locality=[]
City=[]

for i in places:
    try :
         Locality.append(i.split(',')[0].strip().upper())
    except :
         Locality.append('NONE')
    try :
         City.append(i.split(',')[1].strip().upper())
    except :
         City.append('NONE')

# appending NONE to Unique titles list
all_places.append('NONE')

Lets work on the experience column. This column has text which we need to split and get the integer data from the info.

We will be defining a function to extract the integer value and use the function for test and train data.

In [15]:
def changeexperience(x):
    bans=x.split(' ')
    return bans[0]

In [16]:
data['Experience']=data['Experience'].apply(changeexperience)
test['Experience']=test['Experience'].apply(changeexperience)
df['Experience']=df['Experience'].apply(changeexperience)



We will be dropping the Miscellaneous_Info column as the information in it is mostly utilized in different columns and also more than 50% of the data is missing values.

In [17]:
data.drop('Miscellaneous_Info',axis=1,inplace=True)
test.drop('Miscellaneous_Info',axis=1,inplace=True)
df.drop('Miscellaneous_Info',axis=1,inplace=True)


Lets work on the Ratings column. This column has numeric value with percentage symbol.

We will be creating a function to extract the numeric value from it and apply the same function to train and test data. 

Also, as we need to apply the function, we will be creating a temp data with no missing values and apply the same. 

Once we have the data, we will be filling the missing values with mean values for each profile type.

In [18]:
def removepercent(x):
    vans=x.split('%')
    return vans[0]

In [19]:
Rating_Values=data[data['Rating'].notnull()]
Rating_Values.head()


Unnamed: 0,Qualification,Experience,Rating,Place,Profile
0,"BHMS, MD - Homeopathy",24,100%,"Kakkanad, Ernakulam",Homeopath
1,"BAMS, MD - Ayurveda Medicine",12,98%,"Whitefield, Bangalore",Ayurveda
4,BAMS,20,100%,"Keelkattalai, Chennai",Ayurveda
7,BDS,10,99%,"Arekere, Bangalore",Dentist
12,"BDS, MDS",9,98%,"Coimbatore Racecourse, Coimbatore",Dentist


In [20]:
Rating_Values['Rating']=Rating_Values['Rating'].apply(removepercent)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
Rating_Values.head()

Unnamed: 0,Qualification,Experience,Rating,Place,Profile
0,"BHMS, MD - Homeopathy",24,100,"Kakkanad, Ernakulam",Homeopath
1,"BAMS, MD - Ayurveda Medicine",12,98,"Whitefield, Bangalore",Ayurveda
4,BAMS,20,100,"Keelkattalai, Chennai",Ayurveda
7,BDS,10,99,"Arekere, Bangalore",Dentist
12,"BDS, MDS",9,98,"Coimbatore Racecourse, Coimbatore",Dentist


In [22]:
Rating_Values['Rating']=Rating_Values['Rating'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
Rating_Values.groupby('Profile')['Rating'].mean()


Profile
Ayurveda            96.277108
Dentist             97.698454
Dermatologists      93.446281
ENT Specialist      88.973236
General Medicine    91.944238
Homeopath           96.524496
Name: Rating, dtype: float64

In [24]:
def fillratings(cols):
    Rating = cols[0]
    Profile = cols[1]
    
    if pd.isnull(Rating):

        if Profile == 'Ayurveda':
            return '96%'

        elif Profile == 'Dentist':
            return '98%'
        elif Profile=='Dermatologists':
            return '94%'
        elif Profile=='ENT Specialist':
            return '89%'
        elif Profile=='General Medicine':
            return '92%'
        elif Profile=='Homeopath':
            return '97%'
    else:
        return Rating

In [25]:
data['Rating']=data[['Rating','Profile']].apply(fillratings,axis=1)
df['Rating']=df[['Rating','Profile']].apply(fillratings,axis=1)
test['Rating']=test[['Rating','Profile']].apply(fillratings,axis=1)


In [26]:
df.isnull().sum()

Qualification     0
Experience        0
Rating            0
Place            25
Profile           0
Fees              0
dtype: int64

In [27]:
df.isnull().sum()

Qualification     0
Experience        0
Rating            0
Place            25
Profile           0
Fees              0
dtype: int64

In [28]:
df.shape

(5960, 6)

Now lets verify the profile column to see if any cell has more than one value.

In [29]:

Profile = list(data['Profile'])

# Finding Maximum number of Qualification mentioned in a single cell
maxim = 1
for i in Profile :
    if len(i.split(',')) > maxim:
         maxim = len(i.split(','))
         
print("\n\nMaximum Profile in a Cell : ", maxim)



Maximum Profile in a Cell :  1


Lets create a Train Dataset with all the modified and clean columns.

In [30]:
Train_Data = {}

Train_Data['Qualification1'] = Q1
Train_Data['Qualification2'] = Q2
Train_Data['Qualification3'] = Q3
Train_Data['Qualification4'] = Q4
Train_Data['Qualification5'] = Q5
Train_Data['Qualification6'] = Q6
Train_Data['Qualification7'] = Q7
Train_Data['Qualification8'] = Q8
Train_Data['Qualification9'] = Q9
Train_Data['Qualification10'] = Q10
Train_Data['Qualification11'] = Q11
Train_Data['Qualification12'] = Q12
Train_Data['Qualification13'] = Q13
Train_Data['Qualification14'] = Q14
Train_Data['Qualification15'] = Q15
Train_Data['Qualification16'] = Q16
Train_Data['Qualification17'] = Q17
Train_Data['LOCALITY'] = Locality
Train_Data['CITY'] = City
Train_Data['RATING'] = df["Rating"]
Train_Data['EXPERIENCE'] = df["Experience"]
Train_Data['PROFILE'] = df["Profile"]
Train_Data['FEES'] = df["Fees"]

Train_Data = pd.DataFrame(Train_Data)

In [31]:
Train_Data.head()

Unnamed: 0,CITY,EXPERIENCE,FEES,LOCALITY,PROFILE,Qualification1,Qualification10,Qualification11,Qualification12,Qualification13,...,Qualification17,Qualification2,Qualification3,Qualification4,Qualification5,Qualification6,Qualification7,Qualification8,Qualification9,RATING
0,ERNAKULAM,24,100,KAKKANAD,Homeopath,BHMS,NONE,NONE,NONE,NONE,...,NONE,MD - HOMEOPATHY,NONE,NONE,NONE,NONE,NONE,NONE,NONE,100%
1,BANGALORE,12,350,WHITEFIELD,Ayurveda,BAMS,NONE,NONE,NONE,NONE,...,NONE,MD - AYURVEDA MEDICINE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,98%
2,BANGALORE,9,300,MATHIKERE - BEL,ENT Specialist,MBBS,NONE,NONE,NONE,NONE,...,NONE,MS - OTORHINOLARYNGOLOGY,NONE,NONE,NONE,NONE,NONE,NONE,NONE,89%
3,BANGALORE,12,250,BANNERGHATTA ROAD,Ayurveda,BSC - ZOOLOGY,NONE,NONE,NONE,NONE,...,NONE,BAMS,NONE,NONE,NONE,NONE,NONE,NONE,NONE,96%
4,CHENNAI,20,250,KEELKATTALAI,Ayurveda,BAMS,NONE,NONE,NONE,NONE,...,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,100%


In [32]:
Train_Data['RATING']=Train_Data['RATING'].apply(removepercent)
Train_Data['RATING']=Train_Data['RATING'].astype(int)
Train_Data['EXPERIENCE']=Train_Data['EXPERIENCE'].astype(int)

In [33]:
Train_Data.head()

Unnamed: 0,CITY,EXPERIENCE,FEES,LOCALITY,PROFILE,Qualification1,Qualification10,Qualification11,Qualification12,Qualification13,...,Qualification17,Qualification2,Qualification3,Qualification4,Qualification5,Qualification6,Qualification7,Qualification8,Qualification9,RATING
0,ERNAKULAM,24,100,KAKKANAD,Homeopath,BHMS,NONE,NONE,NONE,NONE,...,NONE,MD - HOMEOPATHY,NONE,NONE,NONE,NONE,NONE,NONE,NONE,100
1,BANGALORE,12,350,WHITEFIELD,Ayurveda,BAMS,NONE,NONE,NONE,NONE,...,NONE,MD - AYURVEDA MEDICINE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,98
2,BANGALORE,9,300,MATHIKERE - BEL,ENT Specialist,MBBS,NONE,NONE,NONE,NONE,...,NONE,MS - OTORHINOLARYNGOLOGY,NONE,NONE,NONE,NONE,NONE,NONE,NONE,89
3,BANGALORE,12,250,BANNERGHATTA ROAD,Ayurveda,BSC - ZOOLOGY,NONE,NONE,NONE,NONE,...,NONE,BAMS,NONE,NONE,NONE,NONE,NONE,NONE,NONE,96
4,CHENNAI,20,250,KEELKATTALAI,Ayurveda,BAMS,NONE,NONE,NONE,NONE,...,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,NONE,100


Now lest perform the same on Test data

In [34]:
Qualification = list(test['Qualification'])

Q1 = []
Q2 = []
Q3 = []
Q4 = []
Q5 = []
Q6 = []
Q7 = []
Q8 = []
Q9 = []
Q10 = []
Q11 = []
Q12 = []
Q13 = []
Q14 = []
Q15 = []
Q16 = []
Q17 = []


for i in Qualification:
        try :
            Q1.append(i.split(',')[0].strip().upper())
        except :
            Q1.append('NONE')
        try :
            Q2.append(i.split(',')[1].strip().upper())
        except :
            Q2.append('NONE')
        try :
            Q3.append(i.split(',')[2].strip().upper())
        except :
            Q3.append('NONE')
        try :
            Q4.append(i.split(',')[3].strip().upper())
        except :
            Q4.append('NONE')
        try :
            Q5.append(i.split(',')[4].strip().upper())
        except :
            Q5.append('NONE')
        try :
            Q6.append(i.split(',')[5].strip().upper())
        except :
            Q6.append('NONE')
        try :
            Q7.append(i.split(',')[6].strip().upper())
        except :
            Q7.append('NONE')
        try :
            Q8.append(i.split(',')[7].strip().upper())
        except :
            Q8.append('NONE')
        try :
            Q9.append(i.split(',')[8].strip().upper())
        except :
            Q9.append('NONE')    
        try :
            Q10.append(i.split(',')[9].strip().upper())
        except :
            Q10.append('NONE')
        try :
            Q11.append(i.split(',')[10].strip().upper())
        except :
            Q11.append('NONE')
        try :
            Q12.append(i.split(',')[11].strip().upper())
        except :
            Q12.append('NONE')
        try :
            Q13.append(i.split(',')[12].strip().upper())
        except :
            Q13.append('NONE')
        try :
            Q14.append(i.split(',')[13].strip().upper())
        except :
            Q14.append('NONE')
        try :
            Q15.append(i.split(',')[14].strip().upper())
        except :
            Q15.append('NONE')
        try :
            Q16.append(i.split(',')[15].strip().upper())
        except :
            Q16.append('NONE')
        try :
            Q17.append(i.split(',')[16].strip().upper())
        except :
            Q17.append('NONE')

In [35]:
places= list(test['Place'])


Locality=[]
City=[]

for i in places:
    try :
         Locality.append(i.split(',')[0].strip().upper())
    except :
         Locality.append('NONE')
    try :
         City.append(i.split(',')[1].strip().upper())
    except :
         City.append('NONE')


In [36]:
test['Rating']=test['Rating'].apply(removepercent)
test['Rating']=test['Rating'].astype(int)
test['Experience']=test['Experience'].astype(int)

In [37]:
Test_Data = {}

Test_Data['Qualification1'] = Q1
Test_Data['Qualification2'] = Q2
Test_Data['Qualification3'] = Q3
Test_Data['Qualification4'] = Q4
Test_Data['Qualification5'] = Q5
Test_Data['Qualification6'] = Q6
Test_Data['Qualification7'] = Q7
Test_Data['Qualification8'] = Q8
Test_Data['Qualification9'] = Q9
Test_Data['Qualification10'] = Q10
Test_Data['Qualification11'] = Q11
Test_Data['Qualification12'] = Q12
Test_Data['Qualification13'] = Q13
Test_Data['Qualification14'] = Q14
Test_Data['Qualification15'] = Q15
Test_Data['Qualification16'] = Q16
Test_Data['Qualification17'] = Q17
Test_Data['LOCALITY'] = Locality
Test_Data['CITY'] = City
Test_Data['RATING'] = test["Rating"]
Test_Data['EXPERIENCE'] = test["Experience"]
Test_Data['PROFILE'] = test["Profile"]

Test_Data = pd.DataFrame(Test_Data)

Now we need to label encode the categorical values.

We will be first combining the test and train data to use the label encoding.

In [38]:
all_cities=[]
all_localities=[]
all_profile=[]
all_cities = [Train_Data[['CITY']], Test_Data[['CITY']]]
all_localities = [Train_Data[['LOCALITY']], Test_Data[['LOCALITY']]]
all_profile=[Train_Data[['PROFILE']], Test_Data[['PROFILE']]]
all_cities = pd.concat(all_cities)
all_localities = pd.concat(all_localities)
all_profile = pd.concat(all_profile)


In [39]:
le_qualification = LabelEncoder()
le_profile = LabelEncoder()

le_city = LabelEncoder()

le_locality = LabelEncoder()


le_qualification.fit(all_qualification)
le_profile.fit(all_profile)

le_city.fit(all_cities)
le_locality.fit(all_localities)

  y = column_or_1d(y, warn=True)


LabelEncoder()

In [40]:
Train_Data.isnull().values.any()

False

In [41]:
Test_Data.isnull().values.any()

False

In [42]:
Train_Data['Qualification1'] = le_qualification.transform(Train_Data['Qualification1'])
Train_Data['Qualification2'] = le_qualification.transform(Train_Data['Qualification2'])
Train_Data['Qualification3'] = le_qualification.transform(Train_Data['Qualification3'])
Train_Data['Qualification4'] = le_qualification.transform(Train_Data['Qualification4'])
Train_Data['Qualification5'] = le_qualification.transform(Train_Data['Qualification5'])
Train_Data['Qualification6'] = le_qualification.transform(Train_Data['Qualification6'])
Train_Data['Qualification7'] = le_qualification.transform(Train_Data['Qualification7'])
Train_Data['Qualification8'] = le_qualification.transform(Train_Data['Qualification8'])
Train_Data['Qualification9'] = le_qualification.transform(Train_Data['Qualification9'])
Train_Data['Qualification10'] = le_qualification.transform(Train_Data['Qualification10'])
Train_Data['Qualification11'] = le_qualification.transform(Train_Data['Qualification11'])
Train_Data['Qualification12'] = le_qualification.transform(Train_Data['Qualification12'])
Train_Data['Qualification13'] = le_qualification.transform(Train_Data['Qualification13'])
Train_Data['Qualification14'] = le_qualification.transform(Train_Data['Qualification14'])
Train_Data['Qualification15'] = le_qualification.transform(Train_Data['Qualification15'])
Train_Data['Qualification16'] = le_qualification.transform(Train_Data['Qualification16'])
Train_Data['Qualification17'] = le_qualification.transform(Train_Data['Qualification17'])
Train_Data['LOCALITY'] = le_locality.transform(Train_Data['LOCALITY'])
Train_Data['CITY'] = le_city.transform(Train_Data['CITY'])
Train_Data['PROFILE'] = le_profile.transform(Train_Data['PROFILE'])


In [43]:
Test_Data['Qualification1'] = le_qualification.transform(Test_Data['Qualification1'])
Test_Data['Qualification2'] = le_qualification.transform(Test_Data['Qualification2'])
Test_Data['Qualification3'] = le_qualification.transform(Test_Data['Qualification3'])
Test_Data['Qualification4'] = le_qualification.transform(Test_Data['Qualification4'])
Test_Data['Qualification5'] = le_qualification.transform(Test_Data['Qualification5'])
Test_Data['Qualification6'] = le_qualification.transform(Test_Data['Qualification6'])
Test_Data['Qualification7'] = le_qualification.transform(Test_Data['Qualification7'])
Test_Data['Qualification8'] = le_qualification.transform(Test_Data['Qualification8'])
Test_Data['Qualification9'] = le_qualification.transform(Test_Data['Qualification9'])
Test_Data['Qualification10'] = le_qualification.transform(Test_Data['Qualification10'])
Test_Data['Qualification11'] = le_qualification.transform(Test_Data['Qualification11'])
Test_Data['Qualification12'] = le_qualification.transform(Test_Data['Qualification12'])
Test_Data['Qualification13'] = le_qualification.transform(Test_Data['Qualification13'])
Test_Data['Qualification14'] = le_qualification.transform(Test_Data['Qualification14'])
Test_Data['Qualification15'] = le_qualification.transform(Test_Data['Qualification15'])
Test_Data['Qualification16'] = le_qualification.transform(Test_Data['Qualification16'])
Test_Data['Qualification17'] = le_qualification.transform(Test_Data['Qualification17'])
Test_Data['LOCALITY'] = le_locality.transform(Test_Data['LOCALITY'])
Test_Data['CITY'] = le_city.transform(Test_Data['CITY'])
Test_Data['PROFILE'] = le_profile.transform(Test_Data['PROFILE'])


In [44]:
# Classifying Independent and Dependent Features
#_______________________________________________

# Dependent Variable
Y_train = Train_Data['FEES']

# Independent Variables
X_train = Train_Data.drop(['FEES'], axis = 1)

# Independent Variables for Test Set
X_test = Test_Data.iloc[:,:].values
Y_test=[]


Now lets use Random Forest

In [45]:
m = RandomForestClassifier(n_jobs=2,random_state=1)
m.fit(X_train,Y_train)
m.score(X_train,Y_train)

0.9765100671140939

Decision Tree Classifier

In [50]:
dt = DecisionTreeClassifier(random_state = 0 , max_depth = 5)
dt.fit(X_train , Y_train)
dt.score(X_train , Y_train)

0.2966442953020134

As we can see the Random Forest has an accuracy of 97.65 which is pretty good for the model. In order to avoid overfitting or underfitting we will be hypertuning our model using Grid searchCV.

In [51]:
m.predict(X_test)

array([150, 300,  50, ..., 100, 250, 200], dtype=int64)

In [52]:
estimator = RandomForestRegressor()
param_grid = { "n_estimators" :[50,100,200] ,
               "max_features" : ["auto", "sqrt", "log2"] ,
               "bootstrap": [True, False] ,
               "min_samples_split" : [2,4,8]
             }
clf = GridSearchCV(estimator, param_grid, cv=5)
clf.fit(X_train,Y_train)
print(f"best parameters: {clf.best_params_}")
print(f"best score: {clf.score(X_train,Y_train)}")

best parameters: {'bootstrap': True, 'max_features': 'log2', 'min_samples_split': 8, 'n_estimators': 100}
best score: 0.65022174131293


In [53]:
#Saving the best model for future use

from sklearn.externals import joblib
joblib.dump(clf,'Doctor Fee Prediction.obj')

['Doctor Fee Prediction.obj']