**Importing Packages** 

In [None]:
import pandas as pd
import numpy as np

**Creating the dataframe for the CSV input**

In [None]:
# Load the customer records from Customers.csv (path is relative to the working directory).
data=pd.read_csv('Customers.csv')
# Bare last expression: show the loaded frame as the cell output.
data

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6
...,...,...,...,...,...,...,...,...
1995,1996,Female,71,184387,40,Artist,8,7
1996,1997,Female,91,73158,32,Doctor,7,7
1997,1998,Male,87,90961,14,Healthcare,9,2
1998,1999,Male,77,182109,4,Executive,7,2


**Data Preprocessing**

In [None]:
# Discard every row that contains at least one missing value and rebind the
# name, so all following cells operate on the cleaned frame.
data = data.dropna()

In [None]:
# Show the frame again now that rows with missing values have been removed.
data

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
0,1,Male,19,15000,39,Healthcare,1,4
1,2,Male,21,35000,81,Engineer,3,3
2,3,Female,20,86000,6,Engineer,1,1
3,4,Female,23,59000,77,Lawyer,0,2
4,5,Female,31,38000,40,Entertainment,2,6
...,...,...,...,...,...,...,...,...
1995,1996,Female,71,184387,40,Artist,8,7
1996,1997,Female,91,73158,32,Doctor,7,7
1997,1998,Male,87,90961,14,Healthcare,9,2
1998,1999,Male,77,182109,4,Executive,7,2


## **Descriptive Statistics**

In [None]:
# (number of rows, number of columns) of the dataset
data.shape

(1965, 8)

In [None]:
# Column labels available in the dataset
data.columns

Index(['CustomerID', 'Gender', 'Age', 'Annual Income ($)',
       'Spending Score (1-100)', 'Profession', 'Work Experience',
       'Family Size'],
      dtype='object')

In [None]:
# Per-column dtype and non-null count, plus the frame's memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1965 entries, 0 to 1999
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              1965 non-null   int64 
 1   Gender                  1965 non-null   object
 2   Age                     1965 non-null   int64 
 3   Annual Income ($)       1965 non-null   int64 
 4   Spending Score (1-100)  1965 non-null   int64 
 5   Profession              1965 non-null   object
 6   Work Experience         1965 non-null   int64 
 7   Family Size             1965 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 138.2+ KB


In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns.
# describe has to be *called*: the bare attribute only displays the bound
# method's repr, which is just a dump of the frame rather than a summary.
data.describe()

<bound method NDFrame.describe of       CustomerID  Gender  Age  Annual Income ($)  Spending Score (1-100)  \
0              1    Male   19              15000                      39   
1              2    Male   21              35000                      81   
2              3  Female   20              86000                       6   
3              4  Female   23              59000                      77   
4              5  Female   31              38000                      40   
...          ...     ...  ...                ...                     ...   
1995        1996  Female   71             184387                      40   
1996        1997  Female   91              73158                      32   
1997        1998    Male   87              90961                      14   
1998        1999    Male   77             182109                       4   
1999        2000    Male   90             110610                      52   

         Profession  Work Experience  Family Size  
0

In [None]:
# Number of distinct professions. dropna=False counts a missing value as its
# own category, which is exactly what len(...unique()) does.
data['Profession'].nunique(dropna=False)

9

In [None]:
# Number of customers in each profession, most frequent first
data['Profession'].value_counts()

Artist           612
Healthcare       339
Entertainment    234
Engineer         179
Doctor           161
Executive        153
Lawyer           142
Marketing         85
Homemaker         60
Name: Profession, dtype: int64

## **Customers in the Artist Profession**

In [None]:
# Subset of customers whose profession is 'Artist'
Artist_Profession = data[data['Profession'].eq('Artist')]
Artist_Profession

Unnamed: 0,CustomerID,Gender,Age,Annual Income ($),Spending Score (1-100),Profession,Work Experience,Family Size
5,6,Female,22,58000,76,Artist,0,2
9,10,Female,30,98000,72,Artist,1,4
19,20,Female,35,62000,98,Artist,0,1
22,23,Female,46,42000,5,Artist,13,2
23,24,Male,31,71000,73,Artist,5,2
...,...,...,...,...,...,...,...,...
1987,1988,Male,63,59244,80,Artist,7,1
1988,1989,Female,54,118944,77,Artist,4,4
1990,1991,Female,30,166983,69,Artist,7,3
1993,1994,Female,64,175254,100,Artist,9,5


In [None]:
# Summary statistics of the numeric columns for the Artist subset.
# describe has to be called; the bare attribute only shows the bound method's repr.
Artist_Profession.describe()

<bound method NDFrame.describe of       CustomerID  Gender  Age  Annual Income ($)  Spending Score (1-100)  \
5              6  Female   22              58000                      76   
9             10  Female   30              98000                      72   
19            20  Female   35              62000                      98   
22            23  Female   46              42000                       5   
23            24    Male   31              71000                      73   
...          ...     ...  ...                ...                     ...   
1987        1988    Male   63              59244                      80   
1988        1989  Female   54             118944                      77   
1990        1991  Female   30             166983                      69   
1993        1994  Female   64             175254                     100   
1995        1996  Female   71             184387                      40   

     Profession  Work Experience  Family Size  
5    

In [None]:
import plotly.express as px
# Box plot of the Age distribution within the Artist subset
fig = px.box(Artist_Profession, y='Age')
fig.show()

## **Summary**


*  The box plot shows the Age distribution of the `Artist_Profession` subset, with Age on the y-axis.

*   Hovering over the box plot shows the minimum, first quartile, median, third quartile, interquartile range and outliers as the respective ages for the Artist profession.


*  Median Age could be seen as 47.
*  The interquartile range runs from the first quartile (Q1) to the third quartile (Q3), i.e. from 25 to 72.





In [None]:
# Scatter plot of spending score against annual income for all customers
fig = px.scatter(data, x="Annual Income ($)",y='Spending Score (1-100)')
fig.show()

In [None]:
# NOTE: despite its name, new_df is a GroupBy object keyed by
# (Profession, Annual Income ($)), not a DataFrame.
new_df = data.groupby(['Profession','Annual Income ($)'])
# First entry of each remaining column within every group
new_df.first()

Unnamed: 0_level_0,Unnamed: 1_level_0,CustomerID,Gender,Age,Spending Score (1-100),Work Experience,Family Size
Profession,Annual Income ($),Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Artist,0,170,Male,32,63,2,2
Artist,2000,97,Female,47,47,0,1
Artist,3000,253,Female,78,14,5,4
Artist,4000,128,Male,40,95,0,2
Artist,7000,248,Female,32,16,1,2
...,...,...,...,...,...,...,...
Marketing,181026,1956,Female,11,39,6,2
Marketing,181183,1993,Male,94,24,9,3
Marketing,182937,319,Male,66,44,8,1
Marketing,184324,358,Male,20,51,13,5


### **Splitting the Dataset into Training and Testing Datasets**

In [None]:
# Predictor columns used as model input
Values=['Annual Income ($)','Age']
x=data[Values]
# Target column the models try to predict
y=data['Spending Score (1-100)']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for evaluation. The fixed random_state makes the
# split -- and therefore every metric derived from it -- reproducible on a
# fresh Restart & Run All; without it each run shuffles the rows differently.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

## **Model Fitting**

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares linear regression
L = LinearRegression()
# fit() returns the estimator itself, so model and L refer to the same fitted object
model=L.fit(x_train, y_train)

In [None]:
# Predicted spending scores for the held-out test rows
y_pred = L.predict(x_test)
y_pred

array([49.2648318 , 51.13909097, 47.67965673, 51.61908568, 51.96441269,
       49.61281525, 49.23901731, 48.51424778, 54.42191446, 52.83993925,
       51.60590585, 48.61485461, 50.76703558, 48.90818874, 47.79663331,
       53.11965711, 47.31528973, 53.42762344, 53.36931204, 50.86624725,
       48.51262183, 49.77707372, 53.47811288, 49.16161963, 49.43116997,
       52.61493397, 49.77703048, 50.40351441, 50.65937399, 52.60601713,
       52.92626864, 51.40081967, 48.61775112, 52.92726291, 48.54914792,
       49.96434869, 51.29092477, 51.62261274, 52.69144114, 49.50562388,
       51.76749619, 53.04230573, 52.55383095, 52.9973079 , 49.61956398,
       49.67486833, 50.77029743, 51.61678545, 50.24780534, 50.95212048,
       49.99337776, 53.00620727, 51.26243671, 52.3696617 , 50.85353507,
       50.0159048 , 49.63189992, 52.01520597, 51.36975892, 48.71962002,
       51.29116614, 50.04626609, 52.83511263, 53.06073117, 51.29906722,
       49.95366878, 51.62185216, 52.21699233, 51.05070014, 51.91

**Calculating R^2 and Root Mean Squared Error**

In [None]:
from sklearn.metrics import r2_score, mean_squared_error
# R^2 of the linear model on the held-out test split
print('R2 score',r2_score(y_test, y_pred))
# The square root of the MSE is the *root* mean squared error, so the label
# must say so; it is expressed in the same units as the spending score.
print('Root mean squared error', np.sqrt(mean_squared_error(y_test, y_pred)))

R2 score -0.008993054695426661
Mean squared error 28.624303049795785


### **Ridge Regression**

In [None]:
from sklearn.linear_model import Ridge
# Ridge (L2-regularised) linear regression; alpha=0.0001 is an alternative
# setting that was left disabled here.
R = Ridge(alpha=0.1)

In [None]:
# Fit the ridge model on the training split
R.fit(x_train, y_train)

In [None]:
# Ridge predictions for the held-out test rows
y_pred1 = R.predict(x_test)

In [None]:
# R^2 of the ridge model on the held-out test split
print('R2 score',r2_score(y_test, y_pred1))
# The square root of the MSE is the *root* mean squared error, so the label
# must say so; it is expressed in the same units as the spending score.
print('Root mean squared error', np.sqrt(mean_squared_error(y_test, y_pred1)))

R2 score -0.008993054099047493
Mean squared error 28.62430304133639


**Grid Search**

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate regularisation strengths for the ridge grid search.
# Each value appears exactly once: a repeated entry only makes the search
# refit an identical candidate without changing which alpha wins.
parameters1= [{'alpha': [0.001,0.1,1, 10, 100, 1000, 10000, 100000]}]
parameters1

[{'alpha': [0.001, 0.1, 1, 10, 100, 1000, 10000, 100000, 100000]}]

In [None]:

# Ridge estimator with default settings; it is handed to the grid search as the base model
RR=Ridge()
RR

In [None]:
# 4-fold cross-validated search over the alpha candidates in parameters1
Grid1 = GridSearchCV(RR, parameters1,cv=4)

In [None]:
# Run the search on the training split only: tuning on the full data set
# would let the held-out test rows influence the chosen alpha (data leakage).
Grid1.fit(x_train, y_train)

In [None]:
# Estimator selected by the grid search (best mean cross-validated score)
BestRR = Grid1.best_estimator_
BestRR

In [None]:
# Refit the selected ridge model on the training split
BestRR.fit(x_train, y_train)

In [None]:
# R^2 on the held-out test split. Scoring on the full data set would mix in
# the rows the model was trained on and overstate how well it generalises.
BestRR.score(x_test, y_test)

0.0016551601389337423

In [None]:
%matplotlib inline
import seaborn as sns

In [None]:
def DistributionPlot(RedFunction, BlueFunction, RedName, BlueName, Title):
    """Overlay the distributions of two sets of values on one figure.

    Parameters
    ----------
    RedFunction : array-like
        Values drawn in red.
    BlueFunction : array-like
        Values drawn in blue on the same axes.
    RedName : str
        Legend label for the red distribution.
    BlueName : str
        Legend label for the blue distribution.
    Title : str
        Title placed above the plot.

    Returns
    -------
    None
        The figure is displayed and then closed.
    """
    # pyplot is not imported by any of the visible notebook cells, so the
    # original body raised NameError on its first plt call. Importing it here
    # keeps the function usable regardless of cell execution order.
    import matplotlib.pyplot as plt

    width = 12
    height = 10
    fig, ax = plt.subplots(figsize=(width, height))

    # NOTE(review): sns.distplot is deprecated in newer seaborn releases; it is
    # kept so the function still runs on the seaborn version this notebook
    # targets. Consider sns.histplot / sns.kdeplot when upgrading.
    sns.distplot(RedFunction, color="r", label=RedName, ax=ax)
    sns.distplot(BlueFunction, color="b", label=BlueName, ax=ax)

    ax.set_title(Title)
    ax.set_xlabel('Value of x')
    ax.set_ylabel('Value of y')
    # The label= arguments above are only rendered once a legend is drawn.
    ax.legend()

    plt.show()
    plt.close(fig)