In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [None]:
# Load the raw table. The file is comma-delimited (sep) and every '?' entry is read as NaN (na_values)
crime = pd.read_csv('CommViolPredUnnormalizedData.txt', sep=',', na_values='?')

# Positional indices of the columns to retain, written as one list with the ranges unpacked in place:
# [5, 6, *range(11, 26)] == [5, 6, 11, 12, ..., 25]
columns_to_keep = [5, 6, *range(11, 26), *range(32, 103), 145]

# .iloc selects by integer position and is indexed with square brackets:
# crime.iloc[0] is the first row, crime.iloc[0, 1] is the element in the first row, second column.
# Below, ':' keeps every row and columns_to_keep picks the columns; dropna() then discards
# every row that still contains a missing value
crime = crime.iloc[:, columns_to_keep].dropna()

# features: the first 88 of the 89 retained columns, all rows that survived dropna()
X_crime = crime.iloc[:, range(88)]
# target: the column named ViolentCrimesPerPop
y_crime = crime['ViolentCrimesPerPop']

In [None]:
# split into training and evaluation sets; random_state=0 fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

# Ridge regression is least-squares linear regression with an extra penalty on the weights.
#   least squares minimises:  sum_i (y_i - (w . x_i + b))^2
#   ridge minimises:          sum_i (y_i - (w . x_i + b))^2 + alpha * sum_j (w_j^2)
# Penalising the squared weights discourages the model from fitting the training set too closely
# (overfitting). alpha controls how much the penalty counts relative to the fit error;
# alpha = 0 reduces the objective to ordinary linear regression
linridge = Ridge(alpha=20.0)
linridge.fit(X_train, y_train)

# (message template, value) pairs, printed in order
report = [
    ("ridge regression linear model intercept: {}", linridge.intercept_),
    ("ridge regression linear model coefficients {}", linridge.coef_),
    ("R-squared score (training): {:.3f}", linridge.score(X_train, y_train)),
    ("R-squared score (test): {:.3f}", linridge.score(X_test, y_test)),
]
for template, value in report:
    print(template.format(value))

# Now with MinMax preprocessing of the features

In [None]:
# the formula for minmax scaling is: xnew = (xi - xmin)/(xmax - xmin)
# on the data the scaler is fitted on, this normalizes every feature to be between 0 and 1,
# where 0 is the min value and 1 is the max value
scaler = MinMaxScaler()

# learn each feature's min and max from the training set only, then scale the training set
X_train_scaled = scaler.fit_transform(X_train)
# scale the test set with the min/max learned from the training set. Using fit_transform here
# would re-fit the scaler on the test data, so the two sets would be scaled inconsistently and
# test-set statistics would leak into preprocessing. Because the training min/max are reused,
# scaled test values may fall slightly outside [0, 1]
X_test_scaled = scaler.transform(X_test)

# same model and alpha as the unscaled run, so the two sets of scores are directly comparable
linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

print("ridge regression linear model intercept: {}".format(linridge.intercept_))
print("ridge regression linear model coefficients {}".format(linridge.coef_))
print("R-squared score (training): {:.3f}".format(linridge.score(X_train_scaled, y_train)))
print("R-squared score (test): {:.3f}".format(linridge.score(X_test_scaled, y_test)))