# AIS Summer Comp (Advay Vyas)

### Imports and data

In [111]:
# imports
import os
from math import sqrt

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [112]:
# load data
# The training CSV location can be overridden with the AIS_TRAIN_CSV
# environment variable so the notebook can run on another machine; when the
# variable is unset, the original local path is used unchanged.
train_file_path = os.environ.get(
    'AIS_TRAIN_CSV',
    'C:\\Users\\advay\\OneDrive\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\train.csv',
)
all_data = pd.read_csv(train_file_path)

In [113]:
# target: the column the model is trained to predict
target_column = 'Median House Price'
y = all_data[target_column]
# preview the first rows as the cell output
y.head(n=5)

0    455800
1    365700
2    298300
3    334200
4    219100
Name: Median House Price, dtype: int64

In [114]:
# feature matrix: the columns fed to the model as inputs
features = [
    'Population',
    'Median Age',
    'Unemployment Rate',
    'Median Income',
    'Poverty Rate',
]
X = all_data[features]
# preview the first rows as the cell output
X.head(n=5)

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,840562,35.6,5.1,7.8,26.5
1,913161,37.5,4.0,6.2,54.5
2,768917,33.0,6.6,15.2,24.5
3,710626,42.1,3.1,4.6,53.2
4,791257,38.3,3.5,8.1,47.9


### Splitting data into train and test sets

In [115]:
# hold out part of the data for evaluation; split sizes are left at the
# train_test_split defaults and random_state=1 makes the split repeatable
split_parts = train_test_split(X, y, random_state=1)
train_X, test_X, train_y, test_y = split_parts

### Creating and fitting model on train split

In [116]:
# random forest regressor with a fixed seed
randomForestModel = RandomForestRegressor(random_state=1)

# train on the training split only; the fitted estimator is the cell output
randomForestModel.fit(X=train_X, y=train_y)

RandomForestRegressor(random_state=1)

In [117]:
# predict house prices for the held-out split and show them
modelPredictions = randomForestModel.predict(X=test_X)
print(modelPredictions)

[178959. 217536. 408893. 170103. 157927. 251512. 299478. 430042. 272563.
 261325. 210511. 622167. 284814. 366795. 267922. 459707. 335156. 222525.
 139289. 550688. 412079. 434101. 175369. 507036. 253107. 272880. 167486.
 740697. 255915. 164221. 420249. 221332. 521085. 587107. 661990. 437516.
 484677. 677219. 520711. 138394. 263057. 297774. 228996. 298759. 394742.
 518966. 446809. 580488. 293547. 226901.]


In [118]:
# root-mean-squared error of the held-out predictions against the true prices
modelMSE = mean_squared_error(test_y, modelPredictions)
modelRMSE = sqrt(modelMSE)
rmse_message = 'RMSE for Random Forest Model: {:,.0f}'.format(modelRMSE)
print(rmse_message)

RMSE for Random Forest Model: 152,279


### Fitting model on full data

In [119]:
# second random forest with the same seed as the evaluation model
randomForestFullModel = RandomForestRegressor(random_state=1)

# train on every row of X and y (no hold-out); the fitted estimator is the
# cell output
randomForestFullModel.fit(X=X, y=y)

RandomForestRegressor(random_state=1)

### Predicting competition data

In [120]:
# reading competition data
# The evaluation CSV location can be overridden with the AIS_EVAL_CSV
# environment variable so the notebook can run on another machine; when the
# variable is unset, the original local path is used unchanged.
competition_data_path = os.environ.get(
    'AIS_EVAL_CSV',
    'C:\\Users\\advay\\OneDrive\\Coding\\VSCode\\Python\\AIS Summer Comp 2022\\evaluation_input.csv',
)
competition_data = pd.read_csv(competition_data_path)
# preview the first rows as the cell output
competition_data.head()

Unnamed: 0,ID,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,TX35,857654,33.0,4.1,18.4,25.2
1,PR16,678333,43.0,4.6,13.5,28.3
2,NY4,730314,40.4,3.6,5.7,43.6
3,OR1,858875,38.0,3.8,8.9,40.7
4,GA8,706237,37.6,5.3,17.3,22.7


In [121]:
# competition feature matrix: same columns, in the same order, as `features`
competition_X = competition_data.loc[:, features]
# preview the first rows as the cell output
competition_X.head(n=5)

Unnamed: 0,Population,Median Age,Unemployment Rate,Median Income,Poverty Rate
0,857654,33.0,4.1,18.4,25.2
1,678333,43.0,4.6,13.5,28.3
2,730314,40.4,3.6,5.7,43.6
3,858875,38.0,3.8,8.9,40.7
4,706237,37.6,5.3,17.3,22.7


In [122]:
# predict competition prices with the model fitted on the full training data
competition_preds = randomForestFullModel.predict(X=competition_X)
print(competition_preds)

[ 181738.  263462.  454759.  328474.  153556.  194011.  173298.  335298.
  362043.  130125.  521973.  425683.  354109.  327334.  187811.  157011.
  372066.  219968.  198160.  208461.  278100.  155264.  189246.  457153.
  576324.  909342.  505619.  457460.  586953.  231183.  239555.  549469.
  192125.  165380.  185361.  318349.  207000.  315502.  211717.  261434.
  209660.  134028.  206585.  447752.  258035.  574107.  308588.  313950.
  385112.  133314.  211185. 1207717.  375946.  190700.  225641.  196329.
  305897.  200436.  272230.  193756.  487682.  319045.  427510.  172412.
  150495.  482180.  227938.  464570.  529465.  449783.  215140.  201318.
  284037.  146698.  188042.  407872.  254414.  150379.  156590.  205359.
  156432.  169006.  166054.  409481.  714932.  240354.  193833.  400645.
  212027.  282047.  163117.  483459.  358026.  448904.  172271.  170288.
  364400.  163471.  325038.  255650.  260925.  568234.  145507.  173352.
  286103.  244111.  436884.  160782.  158679.  2300

### Generating submission

In [123]:
# assemble the submission: each competition ID paired with its predicted price
submission_columns = {
    'ID': competition_data['ID'],
    'Median House Price': competition_preds,
}
output = pd.DataFrame(submission_columns)
# index=False keeps the DataFrame index out of the written CSV
output.to_csv('Vyas_Advay_answer.csv', index=False)