In [5]:
# Read in the college data
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

collegeData = pd.read_csv("tentativeCollegeData.csv")

state_book = {
    "AL": "Alabama", 
    "AR": "Arkansas",
    "AZ": "Arizona",
    "CA": "California",
    "CO": "Colorado",
    "CT": "Connecticut",
    "DC": "Washington, DC",
    "FL": "Florida",
    "GA": "Georgia",
    "HI": "Hawaii",
    "ID": "Idaho",
    "IL": "Illinois",
    "IN": "Indiana",
    "IA": "Iowa",
    "KS": "Kansas",
    "KY": "Kentucky",
    "LA": "Louisiana",
    "ME": "Maine",
    "MD": "Maryland",
    "MA": "Massachusetts",
    "MI": "Michigan",
    "MN": "Minnesota",
    "MS": "Mississippi",
    "MO": "Missouri",
    "MT": "Montana",
    "NE": "Nebraska",
    "NV": "Nevada",
    "NH": "New Hampshire",
    "NJ": "New Jersey",
    "NM": "New Mexico",
    "NY": "New York",
    "NC": "North Carolina",
    "ND": "North Dakota",
    "OH": "Ohio",
    "OK": "Oklahoma",
    "OR": "Oregon",
    "PA": "Pennsylvania",
    "RI": "Rhode Island",
    "SC": "South Carolina",
    "SD": "South Dakota",
    "TN": "Tennessee",
    "TX": "Texas",
    "UT": "Utah",
    "VT": "Vermont",
    "VA": "Virginia",
    "WA": "Washington",
    "WV": "West Virginia",
    "WI": "Wisconsin"
# note Alaska, Delaware, and Wyoming are missing 
}

collegeData["General Location"] = collegeData["School Name"] + " " + collegeData["City"]

# recommendation options:


# 1. location - probably concatenation of university name + city + maybe state

# 2. Cost of living - use on campus and off campus housing costs - maybe have a slider so user can say put 6k and it will look for colleges with 6k living or lower

# 3. Maybe similar as cost of living for admission rate

# 4. 

In [6]:
def clean_schoolName(schoolName):
    return re.sub("[^a-zA-Z0-9 ]", "", schoolName)

collegeData["General Location"].apply(clean_schoolName)



0                         Alabama A  M University Normal
1         University of Alabama at Birmingham Birmingham
2         University of Alabama in Huntsville Huntsville
3                    Alabama State University Montgomery
4                   The University of Alabama Tuscaloosa
                             ...                        
981                Emory UniversityOxford College Oxford
982                             Husson University Bangor
983                  Purdue University Northwest Hammond
984    Commonwealth University of Pennsylvania Blooms...
985           Pennsylvania Western University California
Name: General Location, Length: 986, dtype: object

In [7]:
import ipywidgets as widgets
from IPython.display import display

search_vectorizer = TfidfVectorizer(ngram_range=(1,2))

college_tfidif = search_vectorizer.fit_transform(collegeData["General Location"])


def search_college(collegeName):
   collegeName = clean_schoolName(collegeName)
   query_colleges = search_vectorizer.transform([collegeName])
   similaritycollege = cosine_similarity(query_colleges, college_tfidif).flatten()
   collegeIndices = np.argpartition(similaritycollege, -10)[-10:] # 5 most similar colleges
   collegeResults = collegeData.iloc[collegeIndices][::-1]

   return collegeResults

#college_input = widgets.Text(
   # value = "",
   # description = "School Name:",
   # disabled = False
#)

#college_input

In [None]:
college_input = widgets.Text(
    value = "",
    description = "School Name:",
    disabled = False
)

# setup an input for school names

college_list = widgets.Output()

# list of colleges to display?

def collegetypes(givenData):
    with college_list:
        college_list.clear_output()
        collegeTitle = givenData["new"]
        if len(collegeTitle) > 5:
            display(search_college(collegeTitle))

college_input.observe(collegetypes, names = "value")

display(college_input, college_list)
# idea

# synthetic student data where there exists:

# GPA, parent income, location, desired location, extracurriculars, SAT test scores, and general area description (rural, urban, etc.)

# parent income affects what colleges will be recommended based on instate tuition, on campus housing cost, and off campus housing cost
# 
# GPA, extracurriculars, and SAT test scores affects what colleges will be recommended based on average sat score and admission rate
# 
# location and desired location will affect which colleges are recommended based on college name, city, state, population size, and degree of urbanization

Text(value='', description='School Name:')

Output()

In [9]:
# tentative sliders for numeric features

from ipywidgets import HBox, Label
admissionRate_slider = widgets.FloatSlider(
    value = 0.5,
    min = 0,
    max = 1,
    step = 0.01,
    disabled = False,
    continuous_update = False
)

studentPopulationSize_slider = widgets.FloatSlider(
    value = 30000,
    min = 0,
    max = 60000,
    step = 1000,
    disabled = False,
    continuous_update = False
)

instateTuition_slider = widgets.FloatSlider(
    value = 30000,
    min = 0,
    max = 70000,
    step = 1000,
    disabled = False,
    continuous_update = False
)

onCampus_housingCostSlider = widgets.FloatSlider(
    value = 12000,
    min = 0,
    max = 25000,
    step = 1000,
    disabled = False,
    continuous_update = False
)

offCampus_housingCostSlider = widgets.FloatSlider(
    value = 13000,
    min = 0,
    max = 26000,
    step = 1000,
    disabled = False,
    continuous_update = False
)

studentFacultyRatio_Slider = widgets.FloatSlider(
    value = 14,
    min = 0,
    max = 29,
    step = 1,
    disabled = False,
    continuous_update = False
)

medianEarnings_Slider = widgets.FloatSlider(
    value = 61000,
    min = 0,
    max = 125000,
    step = 1000,
    disabled = False,
    continuous_update = False
)

averageSAT_scoreSlider = widgets.FloatSlider(
    value = 800,
    min = 0,
    max = 1600,
    step = 10,
    disabled = False,
    continuous_update = False
)

sliderVec = [admissionRate_slider, studentPopulationSize_slider, studentFacultyRatio_Slider, 
             instateTuition_slider, onCampus_housingCostSlider, offCampus_housingCostSlider, 
             medianEarnings_Slider, averageSAT_scoreSlider]

sliderDescriptions = ["Admission Rate", "Student Population Size",  "Student to Faculty Ratio", 
                      "In-State Tuition", "On-Campus Housing Cost", "Off-Campus Housing Cost", 
                      "Median Earnings after 7 Years", "Average SAT Score"]



In [10]:
for each_slider in range(len(sliderVec)):
    display(HBox([Label(sliderDescriptions[each_slider]), sliderVec[each_slider]]))

# displaying the sliders with descriptions for what the values represent


HBox(children=(Label(value='Admission Rate'), FloatSlider(value=0.5, continuous_update=False, max=1.0, step=0.…

HBox(children=(Label(value='Student Population Size'), FloatSlider(value=30000.0, continuous_update=False, max…

HBox(children=(Label(value='Student to Faculty Ratio'), FloatSlider(value=14.0, continuous_update=False, max=2…

HBox(children=(Label(value='In-State Tuition'), FloatSlider(value=30000.0, continuous_update=False, max=70000.…

HBox(children=(Label(value='On-Campus Housing Cost'), FloatSlider(value=12000.0, continuous_update=False, max=…

HBox(children=(Label(value='Off-Campus Housing Cost'), FloatSlider(value=13000.0, continuous_update=False, max…

HBox(children=(Label(value='Median Earnings after 7 Years'), FloatSlider(value=61000.0, continuous_update=Fals…

HBox(children=(Label(value='Average SAT Score'), FloatSlider(value=800.0, continuous_update=False, max=1600.0,…

In [None]:
collegeNamesDesc = collegeData["School Name"]
type(collegeNamesDesc)

collegeNamesDesc.to_csv("collegeNames.txt", sep = "\t", index = False)


# make a new column with the highest earning majors 