In [None]:
!pip install datacleaner

In [None]:
!pip install fasteda

# Importing Libraries

In [None]:
# Suppress every Python warning for the rest of the session, presumably to keep
# cell output uncluttered.
# NOTE(review): the "ignore" filter is global, so it also hides warnings from
# pandas / scikit-learn that may point at real problems.
import warnings
warnings.filterwarnings("ignore")

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import datacleaner
from datacleaner import autoclean
from fasteda import fast_eda
# Render floats with exactly three decimal places whenever pandas displays them.
pd.set_option('display.float_format', '{:.3f}'.format)

#Visualizations
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Loading

In [None]:
# Load Clean_Dataset.csv from the Kaggle input directory and preview the first rows.
dataset_path = "/kaggle/input/flight-price-prediction/Clean_Dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

# Data Inspection

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Print each column's dtype and non-null count, plus memory usage.
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

# Data Cleaning and Preprocessing

In [None]:
# Remove the "Unnamed: 0" column (presumably a row index saved into the CSV — verify).
# Reassigning instead of inplace=True, together with errors="ignore", keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")
df.head()

In [None]:
# Replace the descriptive column names with generic identifiers: the ten feature
# columns become X0..X9 in the order listed below, and the target "price" becomes "Y".
feature_names = [
    "airline",
    "flight",
    "source_city",
    "departure_time",
    "stops",
    "arrival_time",
    "destination_city",
    "class",
    "duration",
    "days_left",
]
column_aliases = {name: f"X{position}" for position, name in enumerate(feature_names)}
column_aliases["price"] = "Y"
df = df.rename(columns=column_aliases)

In [None]:
# Pass the frame through datacleaner's autoclean. Per the datacleaner docs this
# imputes missing values and encodes non-numeric columns as numbers — confirm
# against the installed version. Then preview the result.
df = datacleaner.autoclean(df)
df.head()

## Missing Values

In [None]:
# Number of missing values in each column.
df.isnull().sum()

## Duplicates

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

## Outliers

In [None]:
# Draw one slim horizontal box plot per column to eyeball potential outliers.
for column in df.columns:
    fig, ax = plt.subplots(figsize=(17, 1))
    sns.boxplot(data=df, x=column, ax=ax)

In [None]:
# Columns singled out for outlier treatment; show their box plots before capping.
features = ["X8", "Y"]

for column in features:
    fig, ax = plt.subplots(figsize=(17, 1))
    sns.boxplot(data=df, x=column, ax=ax)

In [None]:
# Cap each selected column at its box-plot whiskers: values beyond
# q1 - 1.5*IQR or q3 + 1.5*IQR are replaced by the respective whisker value.
whisker_width = 1.5
for col in features:
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lower_whisker = q1 - (whisker_width * iqr)
    upper_whisker = q3 + whisker_width * iqr
    # np.clip on the raw values gives the same result as the nested np.where cascade.
    df[col] = np.clip(df[col].to_numpy(), lower_whisker, upper_whisker)

In [None]:
# Re-draw the box plots for the capped columns to check the effect of the capping.
features = ["X8", "Y"]

for column in features:
    fig, ax = plt.subplots(figsize=(17, 1))
    sns.boxplot(data=df, x=column, ax=ax)

## Skewness

In [None]:
from sklearn.preprocessing import QuantileTransformer

# Transformer that maps a column onto a normal distribution via its quantiles.
# QuantileTransformer randomly subsamples the data when it has more rows than
# `subsample`, so a fixed random_state is needed for the fitted quantiles (and
# everything downstream) to be reproducible across runs.
qt = QuantileTransformer(output_distribution='normal', random_state=42)

In [None]:
# Refit the quantile transformer on X1, overwrite the column with the
# transformed values, and plot the resulting density.
# NOTE(review): X1 is the renamed "flight" column; if autoclean label-encoded it,
# the codes have no ordinal meaning and this transform is questionable — confirm.
skewed_col = "X1"
df[skewed_col] = qt.fit_transform(df[[skewed_col]])
df[[skewed_col]].plot.density(color='purple')
plt.show()

In [None]:
# Refit the quantile transformer on X8, overwrite the column with the
# transformed values, and plot the resulting density.
skewed_col = "X8"
df[skewed_col] = qt.fit_transform(df[[skewed_col]])
df[[skewed_col]].plot.density(color='purple')
plt.show()

In [None]:
# Refit the quantile transformer on X9, overwrite the column with the
# transformed values, and plot the resulting density.
skewed_col = "X9"
df[skewed_col] = qt.fit_transform(df[[skewed_col]])
df[[skewed_col]].plot.density(color='purple')
plt.show()

In [None]:
# Refit the quantile transformer on the target Y, overwrite the column with the
# transformed values, and plot the resulting density. After this cell `qt` holds
# the fit for Y (each fit_transform call replaces the previous fit).
skewed_col = "Y"
df[skewed_col] = qt.fit_transform(df[[skewed_col]])
df[[skewed_col]].plot.density(color='purple')
plt.show()

## Dropping Highly Correlated Features

In [None]:
# Drop X4 (the renamed "stops" column).
# Reassigning instead of inplace=True, together with errors="ignore", keeps this
# cell idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=["X4"], errors="ignore")
df.head()

# Exploratory Data Analysis

In [None]:
# Run the fasteda package's automated exploratory report on the preprocessed frame.
fast_eda(df)

# Machine Learning

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor 

from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

In [None]:
# Separate the target column "Y" from the predictor columns.
target = "Y"
y = df[target]
X = df.drop(columns=[target])

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split —
# and therefore every metric reported below — reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Bar chart comparing the number of rows in the training and test splits.
split_sizes = {'Train': len(y_train), 'Test': len(y_test)}
p1 = sns.barplot(
    x=list(split_sizes.keys()),
    y=list(split_sizes.values()),
    linewidth=1.5,
    edgecolor='black',
)
# Annotate each bar with its row count.
p1.bar_label(p1.containers[0])

p1.set_title('The size of the training and test set', fontweight='bold')
plt.show()

## Random Forest Regressor

In [None]:
# Fit a random forest on the training split.
# random_state pins the bootstrap samples and feature draws so the fitted model,
# its metrics and its feature importances are reproducible across runs.
# n_jobs=-1 builds the trees on all available cores; it does not change the result.
rfr = RandomForestRegressor(random_state=42, n_jobs=-1)
rfr.fit(X_train, y_train)

In [None]:
# Predict the target for the held-out test rows (on the same transformed scale as y_train).
y_pred = rfr.predict(X_test)

In [None]:
# MAE and R^2 are reported on the scale the model was trained on, i.e. the
# quantile-transformed target.
print('MAE:', mean_absolute_error(y_test,y_pred))
print('r2_score:', r2_score(y_test,y_pred))

# MAPE divides by |y_true|. The transformed target is normal-shaped and centred
# on zero, so a MAPE computed there is dominated by near-zero denominators and
# is not interpretable. Map truth and predictions back to the price scale first.
# This relies on `qt` having been fitted on "Y" last (the final skewness cell),
# so run the notebook top to bottom.
y_test_price = qt.inverse_transform(y_test.to_frame()).ravel()
y_pred_price = qt.inverse_transform(pd.DataFrame({"Y": y_pred}, index=y_test.index)).ravel()
print('MAPE (price scale):', mean_absolute_percentage_error(y_test_price, y_pred_price))

In [None]:
# Pair every training column with the forest's feature importance, rank them in
# descending order, and chart the ten largest as a horizontal bar plot.
imp_df = pd.DataFrame(
    {"Feature Name": X_train.columns, "Importance": rfr.feature_importances_}
)
fi = imp_df.sort_values(by="Importance", ascending=False)
fi2 = fi.head(10)

fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=fi2, x='Importance', y='Feature Name', ax=ax)
ax.set_title('Top Feature Importance Each Attributes (Random Forest)', fontsize=18)
ax.set_xlabel('Importance', fontsize=16)
ax.set_ylabel('Feature Name', fontsize=16)
plt.show()

# Sincerely, Mr. Eslam Fouad.