In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

In [None]:
# Load the raw table. `sep=','` sets the field delimiter and `na_values='?'`
# makes pandas treat the string '?' as NaN in addition to its default markers.
crime_raw = pd.read_table('CommViolPredUnnormalizedData.txt', sep=',', na_values='?')

# Positional (0-based) indices of the columns to keep. Lists concatenate with `+`,
# e.g. [5, 6] + list(range(11, 26)) == [5, 6, 11, 12, ..., 25].
# In total this selects 2 + 15 + 71 + 1 = 89 columns; the final index (145) is
# presumably the 'ViolentCrimesPerPop' target -- confirm against the file header.
columns_to_keep = [5, 6] + list(range(11, 26)) + list(range(32, 103)) + [145]

# Name of the regression target column.
TARGET_COLUMN = 'ViolentCrimesPerPop'

# `DataFrame.iloc` selects by integer position and is indexed with square brackets:
#   crime_raw.iloc[0]        -> the first row
#   crime_raw.iloc[[0], [1]] -> the element in the first row, second column
# Here we take every row (`:`) but only the columns listed in `columns_to_keep`,
# then drop any row that still contains a missing value.
# The raw frame is kept under its own name so this cell can be re-run safely.
crime = crime_raw.iloc[:, columns_to_keep].dropna()

# X_crime: every retained column except the target. Selecting by name instead of
# a hard-coded `range(0, 88)` keeps the feature matrix correct if
# `columns_to_keep` is ever edited, and guarantees the target cannot leak into X.
X_crime = crime.drop(columns=TARGET_COLUMN)
# y_crime: only the target column.
y_crime = crime[TARGET_COLUMN]

In [None]:
# Split first so that no information from the held-out rows can influence
# any preprocessing step. `random_state=0` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_crime, y_crime, random_state=0)

# Fit the min-max scaler on the TRAINING split only, then apply that same
# fitted mapping to the test split. Calling `fit_transform` on the test data
# would re-learn min/max from the test rows, so train and test would be scaled
# differently and the test score would no longer be a valid estimate.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Lasso is like ridge regression except the penalty uses the absolute value of
# the weights (L1) instead of their squares (L2). scikit-learn minimizes
#   (1 / (2 * n_samples)) * sum_i (y_i - (w . x_i + b))^2 + alpha * sum_j |w_j|
# The L1 penalty drives many weights to exactly zero, so:
#   - use ridge regression when there are many small/medium effects
#   - use lasso regression when only a few variables have medium/large effects
linlasso = Lasso(alpha=2.0, max_iter=10000).fit(X_train_scaled, y_train)

print("lasso regression linear model intercept: {}".format(linlasso.intercept_))
print("lasso regression linear model coefficients: {}".format(linlasso.coef_))
print("R-squared score (train): {:.3f}".format(linlasso.score(X_train_scaled, y_train)))
print("R-square score (test): {:.3f}".format(linlasso.score(X_test_scaled, y_test)))

# Rank the features Lasso actually kept (non-zero weight) by coefficient
# magnitude, largest first. `prominent_features` maps feature name -> SIGNED
# coefficient: keying by name avoids the collisions that occur when several
# features share the same |coef| (every zeroed-out feature has |coef| == 0), and
# keeping the sign shows whether the fitted association is positive or negative.
# The original author noted that features such as the percentage of kids born
# to never-married parents had the strongest positive weight -- verify against
# the printed output after re-running with the corrected scaling.
prominent_features = dict(
    sorted(
        (
            (feature, coef)
            for feature, coef in zip(X_crime.columns, linlasso.coef_)
            if coef != 0
        ),
        key=lambda item: abs(item[1]),
        reverse=True,
    )
)
for feature, coef in prominent_features.items():
    print(feature + ':  ' + str(coef))