# Hunt Price For North America

## Importing required modules

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from statistics import mode

## Loading the dataset

In [2]:
# Read the hunt-price CSV into a DataFrame and preview its first rows.
DATA_PATH = "data/proj96/Mihalikdata.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Species,Province/State,Price,Number of days,Day price,S Rank,Status,Classification,Avg Mass,SCI,Latitude
0,Mountain Lion,Alberta,15930,10.0,1593,S4,4.0,Carnivore,60.4,Y,54.5
1,Mountain Lion,Alberta,9500,10.0,950,S4,4.0,Carnivore,60.4,Y,54.5
2,Mountain Lion,Alberta,10000,10.0,1000,S4,4.0,Carnivore,60.4,Y,54.5
3,Mountain Lion,Alberta,12500,10.0,1250,S4,4.0,Carnivore,60.4,Y,54.5
4,Mountain Lion,Arizona,5000,5.0,1000,S4,4.0,Carnivore,60.4,Y,34.1661


## Handling NaN values

In [3]:
# Count the missing values in each column.
df.isna().sum()

Species            0
Province/State     0
Price              0
Number of days     0
Day price          0
S Rank            47
Status            47
Classification     0
Avg Mass           0
SCI                0
Latitude           0
dtype: int64

In [6]:
# Fill the missing "S Rank" and "Status" entries with each column's most
# frequent value.
# - Series.mode() ignores NaN, whereas statistics.mode() counts whatever it is
#   given and so could pick NaN itself as the "most common" value.
# - The result is assigned back to the column: calling fillna(inplace=True) on
#   df[col] is chained assignment, which newer pandas deprecates and which does
#   not update df when Copy-on-Write is enabled.
df["S Rank"] = df["S Rank"].fillna(df["S Rank"].mode().iloc[0])
df["Status"] = df["Status"].fillna(df["Status"].mode().iloc[0])

In [7]:
# Re-count missing values per column to confirm the fill left none behind.
df.isna().sum()

Species           0
Province/State    0
Price             0
Number of days    0
Day price         0
S Rank            0
Status            0
Classification    0
Avg Mass          0
SCI               0
Latitude          0
dtype: int64

## Pre-Processing data

In [8]:
# Remove the Latitude column so it is not used as a feature.
# Rebinding df with errors="ignore" (instead of an in-place drop) keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=["Latitude"], errors="ignore")

In [9]:
# Maps each encoded column name to the array of its original class values.
labels = {}

# Only object-dtype columns need encoding; numeric columns are left untouched.
object_columns = [name for name in df.columns if pd.api.types.is_object_dtype(df[name])]

for column in object_columns:
    # A fresh encoder per column: replace the values with integer codes and
    # remember the classes so the codes can be mapped back later.
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    labels[column] = le.classes_
    

In [10]:
# Show the classes recorded for each encoded column; a value's position in
# its array is the integer code it was replaced with.
labels

{'Species': array(['Bighorn Sheep', 'Black Bear', 'Brown Bear', 'Caribou', 'Elk',
        'Gray Wolf', 'Moose', 'Mountain Goat', 'Mountain Lion',
        'Mule Deer', 'Muskox', 'Polar Bear', 'Pronghorn', 'Thinhorn Sheep',
        'White-tailed Deer'], dtype=object),
 'Province/State': array(['Alabama', 'Alaska', 'Alberta', 'Arizona', 'British Columbia',
        'California', 'Colorado', 'Florida', 'Georgia', 'Idaho',
        'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Maine',
        'Manitoba', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
        'Nebraska', 'Nevada', 'New Brunswick', 'New Hampshire',
        'New Mexico', 'New York', 'Newfoundland', 'North Carolina',
        'North Dakota', 'Northwest Territories', 'Nunavut', 'Oklahoma',
        'Ontario', 'Oregon', 'Pennsylvania', 'Quebec', 'Saskatchewan',
        'South Dakota', 'Texas', 'Utah', 'Vermont', 'Virginia',
        'Washington', 'Wyoming', 'Yukon'], dtype=object),
 'S Rank': array(['S1', 'S2', 'S2S3', 'S3

In [11]:
# Preview the frame after label encoding.
df.head()

Unnamed: 0,Species,Province/State,Price,Number of days,Day price,S Rank,Status,Classification,Avg Mass,SCI
0,8,2,15930,10.0,1593,5,4.0,0,60.4,1
1,8,2,9500,10.0,950,5,4.0,0,60.4,1
2,8,2,10000,10.0,1000,5,4.0,0,60.4,1
3,8,2,12500,10.0,1250,5,4.0,0,60.4,1
4,8,3,5000,5.0,1000,5,4.0,0,60.4,1


In [12]:
# Features are every column except the target; the target is the hunt price.
# NOTE(review): in the previewed rows Price equals Number of days x Day price
# (e.g. 10 x 1593 = 15930), so keeping "Day price" as a feature likely leaks
# the target into the model - confirm and consider dropping it.
X = df.drop("Price", axis=1)
y = df["Price"]

# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and every score computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Training model

In [15]:
# Fit a random forest on the training split and report its score (R^2 for
# scikit-learn regressors) on the held-out split.
# random_state pins the forest's internal randomness (bootstrap samples and
# feature sub-sampling) so refitting on the same data gives the same model.
rreg = RandomForestRegressor(random_state=42)
rreg.fit(X_train.values, y_train.values)
rreg.score(X_test.values, y_test.values)

0.9543785951107655

In [16]:
# Train a gradient-boosted regressor with default settings (fit returns the
# estimator itself), then score it on the held-out split.
xreg = XGBRegressor().fit(X_train.values, y_train.values)
xreg.score(X_test.values, y_test.values)

0.986284247452582

## Prediction

In [18]:
# One hand-built feature row, in the column order of X:
# Species, Province/State, Number of days, Day price, S Rank, Status,
# Classification, Avg Mass, SCI. Categorical entries are the integer codes
# produced by the label encoding (see `labels` for the code -> name mapping).
sample_row = [4, 8, 8, 1300, 5, 4, 1, 60, 1]
xreg.predict([sample_row])

array([9983.994], dtype=float32)