In [78]:
import pandas as pd 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
college = pd.read_csv('../Datasets/college/college.csv')
print(college.columns)
ranking = pd.read_csv('../Datasets/college/cwurData.csv')
ranking.columns

Index(['Unnamed: 0', 'Private', 'Apps', 'Accept', 'Enroll', 'Top10perc',
       'Top25perc', 'F.Undergrad', 'P.Undergrad', 'Outstate', 'Room.Board',
       'Books', 'Personal', 'PhD', 'Terminal', 'S.F.Ratio', 'perc.alumni',
       'Expend', 'Grad.Rate'],
      dtype='object')


Index(['world_rank', 'institution', 'country', 'national_rank',
       'quality_of_education', 'alumni_employment', 'quality_of_faculty',
       'publications', 'influence', 'citations', 'broad_impact', 'patents',
       'score', 'year'],
      dtype='object')

In [83]:
# Get the colleges in common
schools_in_college = college['Unnamed: 0']
schools_in_ranking = ranking['institution']
common_schools = pd.Series(list(set(schools_in_ranking).intersection(set(schools_in_college))))
# Get the sub-dataframe for common schools
schools_in_college = college[college.iloc[:, 0].isin(common_schools)]
# Notice the colleges in ranking actually has duplicates institutions. We drop them and keep the top ranking to combine two sub-dataframes
schools_in_ranking = ranking[ranking.iloc[:, 1].isin(common_schools)].drop_duplicates(subset=['institution'], keep='first')
# Merge two dataframes
merged_df = schools_in_college.merge(schools_in_ranking, left_on='Unnamed: 0', right_on='institution').dropna()
merged_df.columns
merged_df.head

<bound method NDFrame.head of                           Unnamed: 0 Private  Apps  Accept  Enroll  Top10perc  \
0                  Baylor University     Yes  6075    5349    2367         34   
2     Bowling Green State University      No  9251    7333    3076         14   
3                Brandeis University     Yes  4186    2743     740         48   
7                Clarkson University     Yes  2174    1953     557         35   
8                 Clemson University      No  8065    5257    2301         37   
..                               ...     ...   ...     ...     ...        ...   
85             University of Wyoming      No  2029    1516    1073         23   
87  Virginia Commonwealth University      No  4963    3497    1567         18   
88            Wake Forest University     Yes  5661    2392     903         75   
89               Wesleyan University     Yes  4772    1973     712         60   
90       Western Michigan University      No  9167    7191    2738         24  

In [106]:
# Select features
features = merged_df.select_dtypes(exclude=[object]).columns
features = features.drop(['world_rank', 'national_rank'])
X = merged_df[features]
y = merged_df['world_rank']

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# Select model
model = RandomForestRegressor(random_state = 1)
# Train model
model.fit(X_train, y_train)
# error
print(f"Training error: {mean_squared_error(y_train, model.predict(X_train))}")
print(f"Test error: {mean_squared_error(y_test, model.predict(X_test))}")

# Show some example
samples = merged_df.institution[:8]
for s in samples: 
    print(s)
    print("\t True ranking", merged_df.loc[merged_df.institution == s]['world_rank'].values[0])
    print("\t Predicted ranking", model.predict(merged_df.loc[merged_df.institution == s][features]))

Training error: 191.18504897959198
Test error: 564.7696117647062
Baylor University
	 True ranking 473
	 Predicted ranking [476.09]
Bowling Green State University
	 True ranking 793
	 Predicted ranking [789.54]
Brandeis University
	 True ranking 236
	 Predicted ranking [239.61]
Clarkson University
	 True ranking 766
	 Predicted ranking [786.09]
Clemson University
	 True ranking 510
	 Predicted ranking [503.64]
College of William and Mary
	 True ranking 361
	 Predicted ranking [371.76]
Concordia University
	 True ranking 422
	 Predicted ranking [420.82]
Creighton University
	 True ranking 490
	 Predicted ranking [487.26]
