# WEEK 4 PROJECT


## 1. Import required libraries and read the dataset.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import KNNImputer,SimpleImputer
from scipy.stats import zscore
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import warnings 
warnings.filterwarnings('ignore')

In [None]:
# Load the raw apps dataset from the CSV file in the working directory.
df = pd.read_csv('Apps_data+(1).csv')
# Show the first five rows as a quick check that the file was read correctly.
df.head()

### 2. Check the first few samples, shape, info of the data and try to familiarize yourself with different features.

In [None]:
df.sample(5)

In [None]:
rows,columns = df.shape
print('Number of rows : ',rows)
print('Number of columns : ',columns)

In [None]:
df.info()

### 3. Check summary statistics of the dataset. List out the columns that need to be worked upon for model building.


In [None]:
df.describe().T

In [None]:
# all the categorical column needs to be worked for model buiding
df.describe(include=object).T

In [None]:
df1 = df.copy()

### 4. Check whether there are any duplicate records in the dataset. If there are, drop them.

In [None]:
print('Any duplicate records in the dataset?',df1.duplicated().any())
df1[df1.duplicated()].count()


In [None]:
df1.drop_duplicates(inplace=True)

In [None]:
df1[df1.duplicated()].count()

###  5. Check the unique categories of the column 'Category'. Is there any invalid category? If yes, drop it.

In [None]:
df1.Category.unique()

In [None]:
df1['Category'].value_counts()

In [None]:
# 1.9 is the invalid category
df1.drop(df1[df1['Category'] == '1.9'].index,inplace=True)

###  6. Check if there are missing values present in the column 'Rating'. If any, drop them and create a new column 'Rating_category' by converting ratings to high and low categories (>3.5 is high, rest low).


In [None]:
df1['Rating'].isnull().sum()

In [None]:
df1.dropna(subset=['Rating'],inplace=True)

In [None]:
df1.Rating.isnull().sum()

In [None]:
df1['Rating_category'] = df1['Rating'].apply(lambda x: 'High' if x>3.5 else 'Low')
df1.head()

###  7. Check the distribution of the newly created column 'Rating_category' and comment on the distribution.

In [None]:
# Plot how many apps fall into each Rating_category value.
sns.displot(df1['Rating_category'])
plt.show()

##### * Nearly 8000 apps have a rating above 3.5 (High).
##### * Fewer than 1000 apps have a rating of 3.5 or below (Low), so the two classes are heavily imbalanced.


### 8. Convert the column "Reviews" to a numeric data type, check for the presence of outliers in the column, and handle the outliers using a transformation approach. (Hint: Use log transformation)

In [None]:
# Current dtype of the Reviews column.
df1.Reviews.dtypes

In [None]:
# Look at the raw values.
df1.Reviews

In [None]:
# Abandoned manual fix for a single bad row, kept for reference:
# df1.Reviews.iloc[8644] = 3.0

In [None]:
# Abandoned direct cast, kept for reference:
# df1.Reviews.astype(float)

In [None]:
# errors='coerce' turns any value that cannot be parsed as a number into NaN
# instead of raising (the original note mentions an invalid value at
# position 8644 in this column).
df1['Reviews'] = pd.to_numeric(df1.Reviews,errors = 'coerce')

In [None]:
# Confirm that the column is numeric now.
df1['Reviews'].dtypes

In [None]:
# Summary statistics formatted with two decimals to avoid scientific notation.
df1['Reviews'].describe().apply(lambda x: ("{:.2f}".format(x)))

In [None]:
# Box plot of the raw review counts to check for outliers.
sns.boxplot(data= df1['Reviews'])

In [None]:
# log1p computes log(1 + x), so zero review counts stay defined.
transformer = FunctionTransformer(np.log1p)
df1['Reviews'] = transformer.fit_transform(df1.Reviews)

In [None]:
# Box plot of the log-transformed review counts, for comparison with the plot above.
sns.boxplot(df1.Reviews)

### 9. The column 'Size' contains alphanumeric values, treat the non numeric data and convert the column into suitable data type. (hint: Replace M with 1 million and K with 1 thousand, and drop the entries where size='Varies with device')


In [None]:
df1.Size[df1['Size'] == 'Varies with device'].value_counts()

In [None]:
df1[df1['Size'] == 'Varies with device'].index

In [None]:
# drop the entries where size='Varies with device'
df1 = df1.drop(labels = df1[df1.Size == 'Varies with device'].index , axis=0)


In [None]:
# check for varies with device
df1.Size[df1['Size'] == 'Varies with device'].value_counts()


In [None]:
# Continue on a copy so df1 keeps the original text form of Size.
df2 = df1.copy()

In [None]:
# Split each Size string into its numeric part (Size_num) and its unit
# letters (Size_unit) so that millions and thousands can be treated separately.
df2[['Size_num','Size_unit']] = df2['Size'].str.extract(r'(\d+\.\d+|\d+)([A-Za-z]+)')

In [None]:
df2.head()

In [None]:
# List the distinct units that have to be converted.
df2['Size_unit'].unique()

In [None]:
# Replace 'M' with 1,000,000 and 'k' with 1,000; a missing unit becomes 0.
df2['Size_unit'] = df2['Size_unit'].replace({'M':1000000,'k':1000,np.nan:0})

In [None]:
df2.head()

In [None]:
# Next working copy; df2 keeps the intermediate text columns.
df3 = df2.copy()

In [None]:
# The extracted number is still text; make it numeric.
df3.Size_num = df3.Size_num.astype(float)

In [None]:
# Same for the unit multiplier.
df3.Size_unit = df3.Size_unit.astype(float)

In [None]:
# Size is the number multiplied by its unit multiplier, which gives a float column.
df3['Size'] = (df3['Size_num']) * (df3['Size_unit'])

In [None]:
df3.head()

In [None]:
df3.info()

In [None]:
# Size is now numeric, so the helper columns Size_num and Size_unit are no longer needed.
df3 = df3.drop(['Size_num','Size_unit'],axis=1)

In [None]:
df3.head()

### 10. Check the column 'Installs', treat the unwanted characters and convert the column into a suitable data type.


In [None]:
df3.Installs.sample(15)

In [None]:
# There are 3 unwanted characters in this column (+,Free)  so i repalce it with blank ,then only we can convert it into numerical
df3['Installs_Morethan'] = df3.Installs.str.replace('+','').str.replace(',','').str.replace('Free','0')

In [None]:
df3.head()

In [None]:
df3['Installs_Morethan'] = df3['Installs_Morethan'].astype(int)

In [None]:
df3.info()

In [None]:
df3.drop(['Installs'],axis=1,inplace=True)

In [None]:
df3.info()

### 11. Check the column 'Price' , remove the unwanted characters and convert the column into a suitable data type.

In [None]:
# check what are the unwanted characteristics
df3.Price.unique()

In [None]:
df3.Price = df3.Price.str.replace('$','').str.replace('Everyone','0')

In [None]:
df3['Price'] = df3['Price'].astype(float)

In [None]:
df3.info()

In [None]:
df4 = df3.copy()

In [None]:
df4.drop(['App', 'Rating' ,'Genres','Last Updated',
'Current Ver','Android Ver'],axis=1,inplace=True)

In [None]:
df4.info()

### 13. Encode the categorical columns.


In [None]:
# Distinct values of each remaining categorical column, to decide how to encode it.
df4.Category.unique()

In [None]:
df4.Type.unique()

In [None]:
df4['Content Rating'].unique()

In [None]:
df4.Rating_category.unique()

In [None]:
# Encode on a copy so df4 keeps the readable text values.
df_dum = df4.copy()

In [None]:
# Category has many unique values, so one-hot encoding would add a lot of
# columns; integer label encoding is used instead.
# Rating_category is the target, so it has to stay a single column and is
# label-encoded as well.
# Each column gets its own encoder: refitting one shared instance would
# overwrite the Category mapping, making it impossible to decode later with
# inverse_transform.
category_encoder = LabelEncoder()
df_dum['Category_label'] = category_encoder.fit_transform(df_dum['Category'])
# labelencoder holds the target mapping (see labelencoder.classes_).
labelencoder = LabelEncoder()
df_dum['Rating_category'] = labelencoder.fit_transform(df_dum['Rating_category'])

In [None]:
df_dum.Rating_category.unique()# it takes 0 for high 1 for low

In [None]:
df_dum.drop(['Category'],axis=1,inplace=True)

In [None]:
df_dum.head()

In [None]:
df_dum = pd.get_dummies(df_dum,columns = ['Type','Content Rating'])

In [None]:
df_dum.head()

### 14. Segregate the target and independent features (Hint: Use Rating_category as the target)

In [None]:
# Name of the column to predict.
target_column = 'Rating_category'
x = df_dum.drop(columns=target_column)  # independent variables
y = df_dum[target_column]  # dependent variable



### 15. Split the dataset into train and test.

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on the target
# so that the train and test sets keep the same class proportions, which
# matters because the two rating categories are far from balanced.
# random_state fixes the shuffle so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.30, random_state=0, stratify=y
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

### 16. Standardize the data, so that the values are within a particular range.

In [None]:
# using log transformation 
df_transform = transformer.fit_transform(df_dum)

In [None]:
df_transform.head()

In [None]:
# using min max scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_minmax = scaler.fit_transform(df_dum)

In [None]:
df_dum.head()