In [1]:
# Importing the Pandas library.
import pandas as pd

In [2]:
# Path to the CSV dataset. The path is relative, so the file has to be reachable
# from the notebook's working directory.
path = "Customer Purchasing Behaviors.csv"

# Loading the CSV file into a DataFrame; the cells below inspect this `df`.
df = pd.read_csv(path)

In [3]:
# Preview the top of the DataFrame: the first 5 rows (5 is also the default of head()).
df.head(n=5)

Unnamed: 0,user_id,age,annual_income,purchase_amount,loyalty_score,region,purchase_frequency
0,1,25,45000,200,4.5,North,12
1,2,34,55000,350,7.0,South,18
2,3,45,65000,500,8.0,West,22
3,4,22,30000,150,3.0,East,10
4,5,29,47000,220,4.8,North,13


In [4]:
# Preview the bottom of the DataFrame: the last 5 rows (5 is also the default of tail()).
df.tail(n=5)

Unnamed: 0,user_id,age,annual_income,purchase_amount,loyalty_score,region,purchase_frequency
233,234,40,60000,450,7.2,West,20
234,235,38,59000,430,6.9,North,20
235,236,54,74000,630,9.4,South,27
236,237,32,52000,360,5.8,West,18
237,238,31,51000,340,5.6,North,17


In [26]:
# Displaying the DataFrame itself. It has more rows than the notebook renders,
# so the output is truncated to the first and the last 5 rows with a "..." row in between.
df

Unnamed: 0,user_id,age,annual_income,purchase_amount,loyalty_score,region,purchase_frequency
0,1,25,45000,200,4.5,North,12
1,2,34,55000,350,7.0,South,18
2,3,45,65000,500,8.0,West,22
3,4,22,30000,150,3.0,East,10
4,5,29,47000,220,4.8,North,13
...,...,...,...,...,...,...,...
233,234,40,60000,450,7.2,West,20
234,235,38,59000,430,6.9,North,20
235,236,54,74000,630,9.4,South,27
236,237,32,52000,360,5.8,West,18


# DataFrame attributes

## .shape attribute

In [5]:
# Shape of the dataset as a tuple: (row count, column count).
df_shape = df.shape
df_shape

(238, 7)

In [6]:
# Row count: the first element of the shape tuple.
df_row_count = df_shape[0]
df_row_count

238

In [7]:
# Column count: the second element of the shape tuple.
df_column_count = df_shape[1]
df_column_count

7

## .columns attribute

In [8]:
# Column names of the dataset, returned as a pandas Index object.
df_columns = df.columns
df_columns

Index(['user_id', 'age', 'annual_income', 'purchase_amount', 'loyalty_score',
       'region', 'purchase_frequency'],
      dtype='object')

In [9]:
# Saving the column names as a plain Python list (the column order is preserved).
df_columns_list = list(df.columns)
df_columns_list

['user_id',
 'age',
 'annual_income',
 'purchase_amount',
 'loyalty_score',
 'region',
 'purchase_frequency']

In [10]:
# Saving the column names as a set. A set is unordered, so the original column
# order is not kept; it is handy for membership tests and set operations.
df_columns_set = set(df.columns)
df_columns_set

{'age',
 'annual_income',
 'loyalty_score',
 'purchase_amount',
 'purchase_frequency',
 'region',
 'user_id'}

## .size attribute

In [11]:
# Size of the DataFrame: the total number of cells, i.e. the product of
# row count and column count (here 238 * 7 = 1666).
df_size = df.size
df_size

1666

## .dtypes attribute

In [12]:
# Data type of each column, returned as a pandas Series indexed by the column names.
df_dtypes = df.dtypes
df_dtypes

user_id                 int64
age                     int64
annual_income           int64
purchase_amount         int64
loyalty_score         float64
region                 object
purchase_frequency      int64
dtype: object

In [13]:
# Data types as a list, in column order. Every element is a numpy.dtype object.
df_dtypes_list = list(df_dtypes)
df_dtypes_list

[dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('int64'),
 dtype('float64'),
 dtype('O'),
 dtype('int64')]

## .empty attribute 

In [14]:
# True if the DataFrame contains no elements (i.e. at least one axis has length 0), else False.
df.empty

False

# Selecting subsets of a DataFrame

## Selecting a single column

In [16]:
# Selecting the column user_id with square brackets and the column label (a string).
# This returns a pandas Series. A one-element list, df[["user_id"]], would
# return a one-column DataFrame instead.
user_id = df["user_id"]
user_id

0        1
1        2
2        3
3        4
4        5
      ... 
233    234
234    235
235    236
236    237
237    238
Name: user_id, Length: 238, dtype: int64

In [17]:
# Selecting the column age using the dot operator (attribute access).
# This only works when the column name is a valid Python identifier and does not
# clash with an existing DataFrame attribute or method; df["age"] always works.
age = df.age
age

0      25
1      34
2      45
3      22
4      29
       ..
233    40
234    38
235    54
236    32
237    31
Name: age, Length: 238, dtype: int64

In [18]:
# Selecting the column annual_income using iloc (position-based indexing):
# ":" selects all rows, 2 is the zero-based position of the third column.
annual_income = df.iloc[:, 2]
annual_income

0      45000
1      55000
2      65000
3      30000
4      47000
       ...  
233    60000
234    59000
235    74000
236    52000
237    51000
Name: annual_income, Length: 238, dtype: int64

In [19]:
# Selecting the column annual_income using iloc, different notation:
# the explicit row slice 0:df.shape[0] covers every row, so it is equivalent to ":".
annual_income_2 = df.iloc[0:df.shape[0], 2]
annual_income_2

0      45000
1      55000
2      65000
3      30000
4      47000
       ...  
233    60000
234    59000
235    74000
236    52000
237    51000
Name: annual_income, Length: 238, dtype: int64

In [20]:
# Selecting the column purchase_amount using loc (label-based indexing):
# ":" selects all rows, "purchase_amount" is the column label.
purchase_amount = df.loc[:, "purchase_amount"]
purchase_amount

0      200
1      350
2      500
3      150
4      220
      ... 
233    450
234    430
235    630
236    360
237    340
Name: purchase_amount, Length: 238, dtype: int64

In [21]:
# Selecting the remaining columns, this time with label-based loc indexing
# (all rows, one column label each).
loyalty_score = df.loc[:, "loyalty_score"]
region = df.loc[:, "region"]
purchase_frequency = df.loc[:, "purchase_frequency"]

# Pandas describe() function

In [22]:
# describe() computes summary statistics (count, mean, std, min, quartiles, max)
# for every numerical column of the DataFrame. Non-numerical columns such as
# region are left out by default.
df_describe = df.describe()
print(df_describe)

# DataFrame.describe() returns a DataFrame with one column per numerical column
# (Series.describe() returns a Series). The summary statistics of a single column
# can therefore be selected by the column name.
for column_name in df_describe.columns:
    print(df_describe[column_name])  # summary statistics for one numerical column.

# You can also access a single statistic of a numerical column. Using
# .loc[<statistic label>, <column name>] instead of positional iloc indices keeps
# every lookup correct even if the column order or the reported percentiles change.
# Here is an example for the column age.
print("The count of the column age is:", df_describe.loc["count", "age"])
print("The mean of the column age is:", df_describe.loc["mean", "age"])
print("The standard deviation of the column age is:", df_describe.loc["std", "age"])
print("The minimum of the column age is:", df_describe.loc["min", "age"])
print("The 25 percentile of the column age is:", df_describe.loc["25%", "age"])
print("The 50 percentile of the column age is:", df_describe.loc["50%", "age"])
print("The 75 percentile of the column age is:", df_describe.loc["75%", "age"])
print("The maximum of the column age is:", df_describe.loc["max", "age"])

# You can decide which percentiles are displayed by passing them as fractions between 0 and 1.
df_describe_percentiles = df.describe(percentiles=[0.1, 0.2, 0.3, 0.4, 0.5])
print(df_describe_percentiles)

          user_id         age  annual_income  purchase_amount  loyalty_score  \
count  238.000000  238.000000     238.000000       238.000000     238.000000   
mean   119.500000   38.676471   57407.563025       425.630252       6.794118   
std     68.848868    9.351118   11403.875717       140.052062       1.899047   
min      1.000000   22.000000   30000.000000       150.000000       3.000000   
25%     60.250000   31.000000   50000.000000       320.000000       5.500000   
50%    119.500000   39.000000   59000.000000       440.000000       7.000000   
75%    178.750000   46.750000   66750.000000       527.500000       8.275000   
max    238.000000   55.000000   75000.000000       640.000000       9.500000   

       purchase_frequency  
count          238.000000  
mean            19.798319  
std              4.562884  
min             10.000000  
25%             17.000000  
50%             20.000000  
75%             23.000000  
max             28.000000  
count    238.000000
mean   

# Statistical functions in Pandas.

In [23]:
# Descriptive statistics for the numerical column annual_income.
# All values are computed first and then printed one per line in the order:
# sum, mean, minimum, maximum, median, variance, standard deviation, count.
annual_income_sum = annual_income.sum()
annual_income_mean = annual_income.mean()
annual_income_min = annual_income.min()
annual_income_max = annual_income.max()
annual_income_median = annual_income.median()
# NOTE(review): "varianz" is the German spelling of "variance"; the name is kept
# unchanged because other cells may refer to it.
annual_income_varianz = annual_income.var()  # sample variance (pandas default ddof=1)
annual_income_standard_deviation = annual_income.std()  # sample standard deviation (ddof=1)
annual_income_count = annual_income.count()  # number of non-missing values

print(
    annual_income_sum,
    annual_income_mean,
    annual_income_min,
    annual_income_max,
    annual_income_median,
    annual_income_varianz,
    annual_income_standard_deviation,
    annual_income_count,
    sep="\n",
)

13663000
57407.56302521008
30000
75000
59000.0
130048381.3778676
11403.875717398343
238
