In [None]:
# Importing libraries
import pandas as pd
import numpy as np

# Column labels applied to the dataset in place of the header line stored in the file
column_names = ['User_ID', 'Lat', 'Long', 'Product_ID', 'Gender', 'Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1', 'Product_Category_2', 'Product_Category_3', 'Purchase']

# Read csv file into a pandas dataframe.
# header=0 marks the first line of the file as a header, so it is skipped while
# parsing and replaced by `column_names`. Loading it with header=None turned the
# header line into a data row, which made pandas treat every column as text (or a
# mix of text and numbers) even after that row was dropped again.
# Row labels therefore start at 0 rather than 1; positional slices are unaffected.
df = pd.read_csv('BlackFriday.csv', header=0, names=column_names)

# Create a copy of our data frame to work with
data = df.copy()

# Print summary statistics
print('The dataset has {} rows and {} columns'.format(data.shape[0], data.shape[1]))
print(data.describe())
print(data.head())
print(data.dtypes.value_counts())

In [None]:
# Cast the coordinate and purchase attributes to numbers; any value that cannot
# be parsed is turned into NaN (errors='coerce') instead of raising
for column in ('Lat', 'Long', 'Purchase'):
    data[column] = pd.to_numeric(data[column], errors='coerce')

# Show how many columns there are of each dtype after the conversion
print(data.dtypes.value_counts())

In [None]:
# Boolean mask over the columns: True where the column is stored as generic objects
is_object_column = data.dtypes == "object"

# Columns with a non-object dtype are treated as numeric, the rest as categorical
numeric_attributes = data.columns[~is_object_column]
categorical_attributes = data.columns[is_object_column]

print(numeric_attributes)
print(categorical_attributes)

In [None]:
# Count the NaN entries of every column once and reuse the counts below
missing_per_column = data.isna().sum()

# Overall number of missing values
print("There are {} missing values in this dataset".format(missing_per_column.sum()))

# Dataset dimensions
n_rows, n_cols = data.shape
print('Number of instances = %d' % n_rows)
print('Number of attributes = %d' % n_cols)

# Per-column breakdown of the missing values
print('Number of missing values:')
for name, n_missing in missing_per_column.items():
    print('\t%s: %d' % (name, n_missing))

In [None]:
# Keep only the rows without any NaN. dropna() returns a new dataframe, so
# `data` itself is left untouched
modified_data = data.dropna()

# Count what is still missing per column (reused for the total and the breakdown)
remaining_missing = modified_data.isna().sum()

# Overall number of missing values
print("There are {} missing values in this dataset".format(remaining_missing.sum()))

# Dimensions after the removal
n_rows, n_cols = modified_data.shape
print('Number of instances = %d' % n_rows)
print('Number of attributes = %d' % n_cols)

# Per-column breakdown of the missing values
print('Number of missing values:')
for name, n_missing in remaining_missing.items():
    print('\t%s: %d' % (name, n_missing))

In [None]:
# Impute the two sparsely filled product category columns with their median.
# The columns are converted with pd.to_numeric first: a median is only defined for
# numeric data, and taking it on a column that still holds text (or a mix of text
# and numbers) either raises or leaves a column mixing strings with a float.
# Entries that cannot be parsed become NaN and are imputed like the other gaps.
data2 = pd.to_numeric(data['Product_Category_2'], errors='coerce')
data3 = pd.to_numeric(data['Product_Category_3'], errors='coerce')

print('Before replacing missing values:')
print(data2.iloc[20:25])  # rows at positions 20-24
print(data3.iloc[20:25])

# fillna returns new Series; the columns inside `data` are not modified
data2 = data2.fillna(data2.median())
data3 = data3.fillna(data3.median())

print('\nAfter replacing missing values:')
print(data2.iloc[20:25])
print(data3.iloc[20:25])


In [None]:
# Replace all missing values with user defined constant
print('Before replacing missing values:')
print(data.iloc[20:25])  # rows at positions 20-24

# Work on a copy so `data` keeps its NaN markers for the following cells
data4 = data.copy()

# np.nan is the canonical spelling; the np.NaN alias no longer exists in NumPy 2.0
data4 = data4.replace(np.nan, "?")

print('\nAfter replacing missing values:')
print(data4.iloc[20:25])

In [None]:
# Independent copy, so dropping columns below does not affect `data`
data5 = data.copy()

# Missing values per column before the removal
print('Number of missing values:')
for name, n_missing in data5.isna().sum().items():
    print('\t%s: %d' % (name, n_missing))

# Remove the two product category columns in a single call
data5 = data5.drop(columns=['Product_Category_2', 'Product_Category_3'])

# Missing values per column for what is left
print('Number of missing values after removal:')
for name, n_missing in data5.isna().sum().items():
    print('\t%s: %d' % (name, n_missing))

In [None]:
%matplotlib inline

data['Purchase'].hist(bins=10)
data['Purchase'].value_counts(sort=False)

In [None]:
# Equal-width discretization: split the range of Purchase into 4 intervals
bins = pd.cut(data['Purchase'], 4)

# Number of rows per interval. Only the last expression of a cell is echoed
# automatically, so the counts are printed explicitly to make them visible.
print(bins.value_counts(sort=False))

# Group the rows by interval and draw the Purchase histogram of every group
data6 = data.groupby(bins)
data6['Purchase'].hist()

In [None]:
# Quantile-based discretization: split Purchase into 4 intervals at its quartiles
bins = pd.qcut(data['Purchase'], 4)

# Number of rows per interval. Only the last expression of a cell is echoed
# automatically, so the counts are printed explicitly to make them visible.
print(bins.value_counts(sort=False))

# Group the rows by interval and draw the Purchase histogram of every group
data7 = data.groupby(bins)
data7['Purchase'].hist()

In [None]:
# Feature subset selection: keep the user id, the coordinates and the purchase amount
selected_columns = ['User_ID', 'Lat', 'Long', 'Purchase']
data8 = data.loc[:, selected_columns]
print(data8)

In [None]:
# Draw 1% of the rows without replacement; the fixed random_state makes the
# draw repeatable
sample = data.sample(random_state=1, frac=0.01)
print(sample)

In [None]:
# Draw 1% of the rows with replacement (the same row may be picked more than
# once); the fixed random_state makes the draw repeatable
sample = data.sample(random_state=1, replace=True, frac=0.01)
print(sample)

In [None]:
from sklearn.datasets import load_breast_cancer

# Fetch the breast cancer dataset bundled with scikit-learn
dataset = load_breast_cancer()

# Put the feature matrix into a dataframe labelled with the feature names.
# Note: this rebinds `df`, which held the raw Black Friday data until now.
feature_names = dataset['feature_names']
df = pd.DataFrame(data=dataset['data'], columns=feature_names)

# First five rows
df.head()

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling parameters from `df`, then apply them to it
scaler = StandardScaler().fit(df)
scaled_data = scaler.transform(df)

print(scaled_data)

In [None]:
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the scaled features onto their first two principal components
pca = PCA(n_components=2)
pca.fit(scaled_data)
x_pca = pca.transform(scaled_data)

# Scatter plot of the projection, colored by the dataset's target labels
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(x_pca[:, 0], x_pca[:, 1], c=dataset['target'])
# Axis labels use the correct term "principal" (previously misspelled "principle")
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')


In [None]:
# Two-column subset that the scaling cells operate on
scaling_columns = ['Marital_Status', 'Purchase']
data9 = data.loc[:, scaling_columns]

# First five rows
data9.head()

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Create an abs_scaler object
abs_scaler = MaxAbsScaler()

# Calculate the maximum absolute value for scaling the data using the fit method
abs_scaler.fit(data9)

# Show the maximum absolute values calculated by the fit method. A bare expression
# in the middle of a cell produces no output, so the values are printed explicitly
# rather than being described by a hard-coded sample result.
print(abs_scaler.max_abs_)

# Transform the data using the parameters calculated by the fit method (the maximum absolute values)
# NOTE: this rebinds `scaled_data`, which the PCA cell reads as the standardized
# breast cancer features; re-run the StandardScaler cell before re-running PCA.
scaled_data = abs_scaler.transform(data9)

# Store the results in a data frame
df_scaled = pd.DataFrame(scaled_data, columns=data9.columns)

# Visualize the data frame
print(df_scaled)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaler with its default settings (rebinds `scaler`)
scaler = MinMaxScaler()

# Fit on data9 and transform it in one step, then wrap the resulting array in a
# dataframe that reuses data9's column labels
normalized_values = scaler.fit_transform(data9)
df_norm = pd.DataFrame(normalized_values, columns=data9.columns)

print(df_norm)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standard scaler with its default settings
std_scaler = StandardScaler()

# Fit on data9 and transform it in one step, then wrap the resulting array in a
# dataframe that reuses data9's column labels
standardized_values = std_scaler.fit_transform(data9)
df_std = pd.DataFrame(standardized_values, columns=data9.columns)

print(df_std)