In [4]:
import warnings
# Ignore every FutureWarning and DeprecationWarning raised from here on.
# NOTE(review): this also hides deprecation notices coming from the pandas /
# scikit-learn calls made in later cells.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

import numpy as np
import pandas as pd
import seaborn as sns

This dataset contains information on ticket prices for flights between six cities in India:

In [2]:
# Read the ticket table; the first CSV column is used as the row index.
flights = pd.read_csv("flights.csv", index_col=0)
# Preview the first rows (head() defaults to five).
flights.head()

Unnamed: 0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
0,SpiceJet,SG-8709,Delhi,Evening,zero,Night,Mumbai,Economy,2.17,1,5953
1,SpiceJet,SG-8157,Delhi,Early_Morning,zero,Morning,Mumbai,Economy,2.33,1,5953
2,AirAsia,I5-764,Delhi,Early_Morning,zero,Early_Morning,Mumbai,Economy,2.17,1,5956
3,Vistara,UK-995,Delhi,Morning,zero,Afternoon,Mumbai,Economy,2.25,1,5955
4,Vistara,UK-963,Delhi,Morning,zero,Morning,Mumbai,Economy,2.33,1,5955


## 1
Create a boolean series called `trips` that is `True` only for flights in which *class* is `"Economy"`, *source_city* is `"Delhi"`, and *destination_city* is `"Mumbai"`.

In [5]:
trips = None  # placeholder; overwritten by the mask built below

# Boolean mask that is True only where all three conditions hold:
# Economy class, departing from Delhi, arriving in Mumbai.
trips = (
    flights['class'].eq('Economy')
    & flights['source_city'].eq('Delhi')
    & flights['destination_city'].eq('Mumbai')
)

In [6]:
# Count the True entries with the vectorised Series.sum() rather than the
# builtin sum(), which would iterate over the Series one element at a time.
print(trips.sum(), "trips are selected.")

9982 trips are selected.


In [7]:
### TESTS
# Vectorised count of selected rows (avoids element-wise iteration by builtin sum()).
assert trips.sum() == 9982, "Wrong number of trips selected."

print("OK")

OK


## 2
Prepare a feature matrix `X` from `flights`, keeping only rows in which `trips` is True and the columns *departure_time*, *airline*, *duration*, *days_left*, and *stops*. 

In the *stops* column, replace `"zero"` with 0, `"one"` with 1, and `"two_or_more"` with 2.
   
Prepare a target vector `y` from the *price* column of `flights`, using the same rows as for `X`.

In [None]:
X, y = None, None  # placeholders; both are overwritten below

# Keep only the rows flagged by the boolean mask `trips`.
filtered_flights = flights.loc[trips]

# Copy so that re-encoding `stops` below never writes into `flights`.
X = filtered_flights[['departure_time', 'airline', 'duration', 'days_left', 'stops']].copy()

# Encode the stop labels as integers with an explicit mapping and dtype.
# `Series.replace` on a string column relies on implicit downcasting, which
# recent pandas versions deprecate (and the FutureWarning filter at the top of
# the notebook would hide). With `map`, any unexpected label becomes NaN and the
# integer cast then raises instead of silently leaving a string in the column.
X['stops'] = X['stops'].map({
    'zero': 0,
    'one': 1,
    'two_or_more': 2
}).astype('int64')

# Target: ticket price for exactly the same rows as X.
y = filtered_flights['price']

In [9]:
print("X:")
# head() returns the first five rows by default.
X.head()

X:


Unnamed: 0,departure_time,airline,duration,days_left,stops
0,Evening,SpiceJet,2.17,1,0
1,Early_Morning,SpiceJet,2.33,1,0
2,Early_Morning,AirAsia,2.17,1,0
3,Morning,Vistara,2.25,1,0
4,Morning,Vistara,2.33,1,0


In [10]:
print("y:")
# head() returns the first five entries by default.
y.head()

y:


0    5953
1    5953
2    5956
3    5955
4    5955
Name: price, dtype: int64

In [11]:
### TESTS
# X must expose exactly these five columns; column order is not checked.
assert set(X.columns) == {
    'airline', 'days_left', 'departure_time', 'duration', 'stops'
}, "Column names of X are incorrect"
# The column means act as a fingerprint of which rows were kept.
assert np.isclose(
    X["days_left"].mean(), 25.915948707673813, rtol=1e-3, atol=1e-3
), "Incorrect rows included in X"
assert np.isclose(
    y.mean(), 6059.826086956522, rtol=1e-3, atol=1e-3
), "Incorrect rows included in y"

print("OK")

OK


## 3
(5.2) Create `X1` from the columns *days_left* and *duration* in `X`. Perform a linear regression on `X1` for the price, and find the coefficient of determination for the fit. (Note: Do not split into training and test sets.)

In [15]:
coeff_det = None  # placeholder; replaced by the model's score below

from sklearn.linear_model import LinearRegression

# Two-column design matrix taken from X: `days_left` and `duration`.
X1 = X.loc[:, ['days_left', 'duration']]

# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X1, y)

# R^2 evaluated on the same data the model was fitted on (no train/test split).
coeff_det = model.score(X1, y)


In [16]:
# Report the R^2 of the two-feature linear fit, formatted to four decimals.
print(f"Linear regression on price has coeff of determination {coeff_det:.4f}")

Linear regression on price has coeff of determination 0.3812


In [17]:
### TESTS
# Column order is irrelevant; only the set of names is compared.
assert set(X1.columns) == {"duration", "days_left"}, "Incorrect columns in X1"
# The score must fall strictly inside the open interval (0.36, 0.40).
assert 0.36 < coeff_det < 0.40, "Incorrect coefficient of determination"

print("OK")

OK


## 4
(5.2) Produce a series, indexed by the column names, of the weights (i.e., coefficient vector) from the model in step 3. (One weight will be negative and one will be positive.)

In [18]:
weights = None  # placeholder; overwritten below

# Label each fitted coefficient with the X1 column it multiplies.
weights = pd.Series(data=model.coef_, index=X1.columns)

In [19]:
# Show the regression weight attached to each X1 column.
print(weights)

days_left   -148.623863
duration     155.435459
dtype: float64


In [20]:
### TESTS
# isinstance() is the idiomatic type check and also accepts Series subclasses.
assert isinstance(weights, pd.Series), "Result must be a series"
assert set(weights.index) == {"days_left", "duration"}, "Result must be indexed by the column names"
# One weight is positive and one is negative, so their product must be negative.
assert weights.prod() < 0, "Weights have incorrect sign(s)"

print("OK")

OK


## 5
(5.2) Create a new frame `Xdum` from `X` that replaces the *airline* and *departure_time* features with dummy variables. **Use `drop_first=True` when creating the dummies.** (This option avoids redundancy among the features.)

Train a pipeline with a robust scaler and a linear regressor using all the columns of `Xdum`. Compute the new coefficient of determination.

In [23]:
Xdum = None            # placeholder; overwritten below
coeff_det_dum = None   # placeholder; overwritten below

# No other cell in this notebook imports these two names, so the cell would
# raise NameError on a fresh kernel (Restart & Run All) without them.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

# Replace the two categorical columns with indicator columns; drop_first=True
# omits one level per category so the indicators are not redundant.
Xdum = pd.get_dummies(X, columns=['airline', 'departure_time'], drop_first=True)

# Robust scaling followed by ordinary least squares, fitted on every Xdum column.
pipeline = make_pipeline(RobustScaler(), LinearRegression())
pipeline.fit(Xdum, y)

# R^2 on the fitting data (no train/test split).
coeff_det_dum = pipeline.score(Xdum, y)


In [24]:
# Report the R^2 of the scaled pipeline fitted on Xdum, formatted to four decimals.
print(f"Linear regression with dummies has coeff of determination {coeff_det_dum:.4f}")

Linear regression with dummies has coeff of determination 0.5051


In [None]:
### TESTS
# Xdum is expected to have exactly 13 columns.
assert len(Xdum.columns) == 13, "Incorrect number of columns in Xdum"
# The score must fall strictly inside the open interval (0.5, 0.55).
assert 0.5 < coeff_det_dum < 0.55, "Incorrect coefficient of determination"

print("OK")

## 6

(5.2) Create a series, indexed by the column names in `Xdum`, of the regression weights for the linear regressor from step 5, sorted in ascending order. (The negative values show how to decrease the price, and the positive ones show how to increase it.)


In [25]:
weights_dum = None  # placeholder; overwritten below

# Read the coefficients from the pipeline's regression step, label them with
# the Xdum column names, and order them from most negative to most positive.
weights_dum = (
    pd.Series(
        data=pipeline.named_steps['linearregression'].coef_,
        index=Xdum.columns,
    )
    .sort_values(ascending=True)
)


In [26]:
# Show the sorted regression weights, one per Xdum column.
print(weights_dum)

days_left                      -3554.766309
departure_time_Night            -151.580915
departure_time_Evening           -98.531068
departure_time_Early_Morning     261.790043
duration                         262.785192
departure_time_Late_Night        367.749335
departure_time_Morning           626.479993
airline_SpiceJet                 897.852995
airline_Indigo                   970.524844
airline_GO_FIRST                1857.536586
airline_Air_India               2376.574484
airline_Vistara                 2771.648938
stops                           2835.409323
dtype: float64


In [27]:
### TESTS
# isinstance() is the idiomatic type check and also accepts Series subclasses.
assert isinstance(weights_dum, pd.Series), "Result must be a series"
assert set(weights_dum.index) == set(Xdum.columns), "Result must be indexed by the feature names"
# Every consecutive difference must be positive, i.e. strictly ascending order.
assert np.all( np.diff(weights_dum.values) > 0 ), "Result must be sorted"
# This check previously had no message, unlike every other assertion here.
assert np.isclose(weights_dum.sum(), 9423.473, rtol=1e-4), "Sum of weights is incorrect"

print("OK")

OK


## 7
(5.3) Use LASSO linear regression on `Xdum` with regularization parameter $\alpha = 10$. Find a list or vector of the names of all features that have weights less than `1e-3` in absolute value.

In [31]:
dropped = None  # placeholder; overwritten below

from sklearn.linear_model import Lasso

# Robust scaling followed by an L1-penalised regressor with alpha = 10.
lasso = make_pipeline(RobustScaler(), Lasso(alpha=10))
lasso.fit(Xdum, y)

# Label each LASSO coefficient with its Xdum column name.
coef = pd.Series(data=lasso.named_steps['lasso'].coef_, index=Xdum.columns)

# Names of the features whose weight magnitude is below 1e-3.
dropped = coef.loc[coef.abs().lt(1e-3)].index.tolist()


In [32]:
# Heading and list on separate lines, emitted with a single print call.
print("Columns to be dropped from the features:", dropped, sep="\n")

Columns to be dropped from the features:
['airline_SpiceJet', 'departure_time_Late_Night']


In [33]:
### TESTS
# Every reported name has to be an actual column of Xdum.
assert all(d in Xdum.columns for d in dropped), "Result must be a list or vector of feature names"
assert len(dropped) > 0, "At least one feature should be marked for dropping"

print("OK")

OK


## 8
(5.4) Use a decision tree regressor with maximum depth 5 on `Xdum` and compute its coefficient of determination score.

Based on this regressor, determine which feature is considered to be most important.

In [34]:
coeff_det_dtr = None
top_feature = ""   # should be a string
from sklearn.tree import DecisionTreeRegressor

# Fix random_state: scikit-learn permutes the candidate features at each split,
# so ties between equally good splits can otherwise be broken differently from
# run to run, changing the fitted tree and its feature importances.
dtr = DecisionTreeRegressor(max_depth=5, random_state=0)
dtr.fit(Xdum, y)

# R^2 on the fitting data (no train/test split).
coeff_det_dtr = dtr.score(Xdum, y)

# Label each importance with its Xdum column and take the label of the largest.
importances = pd.Series(dtr.feature_importances_, index=Xdum.columns)
# str() ensures a plain Python string whatever scalar type the index label has.
top_feature = str(importances.idxmax())

In [35]:
# Report the decision tree's R^2 (four decimals) and its highest-importance feature.
print(f"Decision tree has coeff of determination {coeff_det_dtr:.4f}")
print("Most important decision tree feature is", top_feature)

Decision tree has coeff of determination 0.7257
Most important decision tree feature is days_left


In [36]:
### TESTS
# The score must fall strictly inside the open interval (0.7, 0.75).
assert 0.7 < coeff_det_dtr < 0.75, "Incorrect coefficient of determination"
# isinstance() also accepts str subclasses (e.g. NumPy string scalars), which
# an exact type comparison would reject.
assert isinstance(top_feature, str), "Top feature should be a string"
assert top_feature in Xdum.columns, "Top feature should be a column of Xdum"

print("OK")

OK
