# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline

# Loading DataSet

In [3]:
df = pd.read_csv("CoffeeAndCodeLT2018 - CoffeeAndCodeLT2018.csv")
df

Unnamed: 0,CodingHours,CoffeeCupsPerDay,CoffeeTime,CodingWithoutCoffee,CoffeeType,CoffeeSolveBugs,Gender,Country,AgeRange
0,8,2,Before coding,Yes,Caffè latte,Sometimes,Female,Lebanon,18 to 29
1,3,2,Before coding,Yes,Americano,Yes,Female,Lebanon,30 to 39
2,5,3,While coding,No,Nescafe,Yes,Female,Lebanon,18 to 29
3,8,2,Before coding,No,Nescafe,Yes,Male,Lebanon,
4,10,3,While coding,Sometimes,Turkish,No,Male,Lebanon,18 to 29
...,...,...,...,...,...,...,...,...,...
95,6,2,Before coding,Yes,Nescafe,Yes,Male,Lebanon,18 to 29
96,4,1,Before coding,Sometimes,Nescafe,Sometimes,Female,Lebanon,18 to 29
97,10,3,Before coding,Yes,Cappuccino,Yes,Male,Lebanon,Under 18
98,2,2,While coding,Sometimes,Espresso (Short Black),Sometimes,Female,Lebanon,18 to 29


In [7]:
df.head()

Unnamed: 0,CodingHours,CoffeeCupsPerDay,CoffeeTime,CodingWithoutCoffee,CoffeeType,CoffeeSolveBugs,Gender,Country,AgeRange
0,8,2,Before coding,Yes,Caffè latte,Sometimes,Female,Lebanon,18 to 29
1,3,2,Before coding,Yes,Americano,Yes,Female,Lebanon,30 to 39
2,5,3,While coding,No,Nescafe,Yes,Female,Lebanon,18 to 29
3,8,2,Before coding,No,Nescafe,Yes,Male,Lebanon,
4,10,3,While coding,Sometimes,Turkish,No,Male,Lebanon,18 to 29


In [8]:
df.tail()

Unnamed: 0,CodingHours,CoffeeCupsPerDay,CoffeeTime,CodingWithoutCoffee,CoffeeType,CoffeeSolveBugs,Gender,Country,AgeRange
95,6,2,Before coding,Yes,Nescafe,Yes,Male,Lebanon,18 to 29
96,4,1,Before coding,Sometimes,Nescafe,Sometimes,Female,Lebanon,18 to 29
97,10,3,Before coding,Yes,Cappuccino,Yes,Male,Lebanon,Under 18
98,2,2,While coding,Sometimes,Espresso (Short Black),Sometimes,Female,Lebanon,18 to 29
99,10,4,Before coding,Sometimes,Double Espresso (Doppio),Sometimes,Male,Lebanon,18 to 29


In [9]:
df.dtypes

CodingHours             int64
CoffeeCupsPerDay        int64
CoffeeTime             object
CodingWithoutCoffee    object
CoffeeType             object
CoffeeSolveBugs        object
Gender                 object
Country                object
AgeRange               object
dtype: object

In [10]:
df.columns

Index(['CodingHours', 'CoffeeCupsPerDay', 'CoffeeTime', 'CodingWithoutCoffee',
       'CoffeeType', 'CoffeeSolveBugs', 'Gender', 'Country', 'AgeRange'],
      dtype='object')

In [11]:
df.size

900

In [12]:
df.shape

(100, 9)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   CodingHours          100 non-null    int64 
 1   CoffeeCupsPerDay     100 non-null    int64 
 2   CoffeeTime           100 non-null    object
 3   CodingWithoutCoffee  100 non-null    object
 4   CoffeeType           99 non-null     object
 5   CoffeeSolveBugs      100 non-null    object
 6   Gender               100 non-null    object
 7   Country              100 non-null    object
 8   AgeRange             98 non-null     object
dtypes: int64(2), object(7)
memory usage: 7.2+ KB


In [14]:
df.describe()

Unnamed: 0,CodingHours,CoffeeCupsPerDay
count,100.0,100.0
mean,6.41,2.89
std,2.644205,1.613673
min,1.0,1.0
25%,4.0,2.0
50%,7.0,2.5
75%,8.0,4.0
max,10.0,8.0


In [15]:
df.isnull().sum()

CodingHours            0
CoffeeCupsPerDay       0
CoffeeTime             0
CodingWithoutCoffee    0
CoffeeType             1
CoffeeSolveBugs        0
Gender                 0
Country                0
AgeRange               2
dtype: int64

In [16]:
df.duplicated().sum()

3

In [17]:
# Restrict the computation to numeric columns explicitly: the frame also holds
# string columns, and relying on the implicit default emits a warning on some
# pandas versions and raises on newer ones.
df.skew(numeric_only=True)

  df.skew()


CodingHours        -0.271667
CoffeeCupsPerDay    1.256421
dtype: float64

In [18]:
# Correlate only the numeric columns; the string columns cannot be correlated and
# newer pandas versions no longer drop them silently.
df.corr(numeric_only=True)

Unnamed: 0,CodingHours,CoffeeCupsPerDay
CodingHours,1.0,0.313692
CoffeeCupsPerDay,0.313692,1.0


# Basic Data Cleaning

In [19]:
# Only 3 values are missing (1 in CoffeeType, 2 in AgeRange), so dropping the
# affected rows costs very little data. Reassign instead of mutating in place.
df = df.dropna()

In [20]:
df

Unnamed: 0,CodingHours,CoffeeCupsPerDay,CoffeeTime,CodingWithoutCoffee,CoffeeType,CoffeeSolveBugs,Gender,Country,AgeRange
0,8,2,Before coding,Yes,Caffè latte,Sometimes,Female,Lebanon,18 to 29
1,3,2,Before coding,Yes,Americano,Yes,Female,Lebanon,30 to 39
2,5,3,While coding,No,Nescafe,Yes,Female,Lebanon,18 to 29
4,10,3,While coding,Sometimes,Turkish,No,Male,Lebanon,18 to 29
5,8,2,While coding,Sometimes,Nescafe,Yes,Male,Lebanon,30 to 39
...,...,...,...,...,...,...,...,...,...
95,6,2,Before coding,Yes,Nescafe,Yes,Male,Lebanon,18 to 29
96,4,1,Before coding,Sometimes,Nescafe,Sometimes,Female,Lebanon,18 to 29
97,10,3,Before coding,Yes,Cappuccino,Yes,Male,Lebanon,Under 18
98,2,2,While coding,Sometimes,Espresso (Short Black),Sometimes,Female,Lebanon,18 to 29


# Data Visualisation Using Autoviz

In [None]:
! pip install Autoviz

In [None]:
! pip install xlrd

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class

# Profile the DataFrame that is already loaded in `df` instead of re-reading the
# CSV from a second, environment-specific path. AutoViz accepts an in-memory
# frame through `dfte` when `filename` is an empty string.
AV = AutoViz_Class()
df_av = AV.AutoViz(filename="", dfte=df)

# Data Visualisation

In [None]:
df['CodingHours'].value_counts()

In [None]:
sns.countplot(x = 'CodingHours',data = df)
plt.show()

#### Most of the people code for 8 hrs

In [None]:
df['CoffeeCupsPerDay'].value_counts()

In [None]:
sns.countplot(x = 'CoffeeCupsPerDay',data = df)
plt.show()

#### Most of the people drink 2 coffees a day

In [None]:
df['CoffeeTime'].value_counts()

In [None]:
sns.set(rc={'figure.figsize':(14,14)})
sns.countplot(x = 'CoffeeTime',data = df)
plt.show()

#### Most people drink coffee while coding

In [None]:
df['CodingWithoutCoffee'].value_counts()

In [None]:
sns.set(rc={'figure.figsize':(6,6)})
sns.countplot(x = 'CodingWithoutCoffee',data = df)
plt.show()

#### Most people sometimes code without coffee

In [None]:
df['CoffeeType'].value_counts()

In [None]:
sns.set(rc={'figure.figsize':(15,15)})
sns.countplot(x = 'CoffeeType',data = df)
plt.show()

#### Most people drink Nescafe coffee

In [None]:
df['CoffeeSolveBugs'].value_counts()

In [None]:
sns.set(rc={'figure.figsize':(6,6)})
sns.countplot(x = 'CoffeeSolveBugs',data = df)
plt.show()

#### Most of the people sometimes solved bugs while drinking coffee

In [None]:
df['Gender'].value_counts()

In [None]:
sns.countplot(x = 'Gender',data = df)
plt.show()

#### The data consists mostly of males

In [None]:
df['Country'].value_counts()

In [None]:
sns.countplot(x = 'Country',data = df)
plt.show()

#### This data is from only one country, i.e., Lebanon

In [None]:
df['AgeRange'].value_counts()

In [None]:
sns.countplot(x = 'AgeRange',data = df)
plt.show()

#### Most people who code are aged 18-29

In [None]:
# Bar chart of the mean CodingHours for each CodingWithoutCoffee answer.
sns.set(rc={'figure.figsize': (6, 6)})
data = df.copy()
data.groupby('CodingWithoutCoffee')['CodingHours'].mean().plot(kind='bar')
plt.title('CodingWithoutCoffee')
plt.xlabel('CodingWithoutCoffee')
plt.ylabel('CodingHours')
plt.show()

In [None]:
# Bar chart of the mean CodingHours for each CoffeeType.
sns.set(rc={'figure.figsize': (6, 6)})
data = df.copy()
data.groupby('CoffeeType')['CodingHours'].mean().plot(kind='bar')
plt.title('CoffeeType')
plt.xlabel('CoffeeType')
plt.ylabel('CodingHours')
plt.show()

In [None]:
# Bar chart of the mean CodingHours for each CoffeeSolveBugs answer.
sns.set(rc={'figure.figsize': (6, 6)})
data = df.copy()
data.groupby('CoffeeSolveBugs')['CodingHours'].mean().plot(kind='bar')
plt.title('CoffeeSolveBugs')
plt.xlabel('CoffeeSolveBugs')
plt.ylabel('CodingHours')
plt.show()

In [None]:
# Bar chart of the mean CodingHours for each Gender.
sns.set(rc={'figure.figsize': (6, 6)})
data = df.copy()
data.groupby('Gender')['CodingHours'].mean().plot(kind='bar')
plt.title('Gender')
plt.xlabel('Gender')
plt.ylabel('CodingHours')
plt.show()

#### Males code more than females

In [None]:
# Bar chart of the mean CodingHours for each AgeRange.
sns.set(rc={'figure.figsize': (6, 6)})
data = df.copy()
data.groupby('AgeRange')['CodingHours'].mean().plot(kind='bar')
plt.title('AgeRange')
plt.xlabel('AgeRange')
plt.ylabel('CodingHours')
plt.show()

#### People under 18 code more

In [None]:
df1 = df.groupby('CoffeeTime').agg({'CoffeeCupsPerDay' :'mean'})
df1

In [None]:
# Plot the per-CoffeeTime aggregate held in df1 as grouped bars.
px.bar(
    data_frame=df1,
    title="<b>Coffee Time wise Analyzing</b>",
    barmode='group',
    template="plotly_dark",
)

#### People who drink coffee in the morning or before coding do not drink as many cups of coffee

In [None]:
df2 = df.groupby('CodingWithoutCoffee').agg({'CodingHours':'mean','CoffeeCupsPerDay' :'mean'})
df2

In [None]:
# df2 is grouped by CodingWithoutCoffee, so the title names that column
# (the previous title was copied from the CoffeeTime chart).
px.bar(data_frame=df2, barmode='group',
       title = "<b>Coding Without Coffee wise Analyzing</b>",template="plotly_dark")

#### Most of the people code with coffee

In [None]:
df3 = df.groupby('CoffeeType').agg({'CodingHours':'mean','CoffeeCupsPerDay' :'mean'})
df3

In [None]:
# df3 is grouped by CoffeeType, so the title names that column
# (the previous title was copied from the CoffeeTime chart).
px.bar(data_frame=df3, barmode='group',
       title = "<b>Coffee Type wise Analyzing</b>",template="plotly_dark")

#### Most of the people drink Double Espresso (Doppio) about average of 5 cups a day while coding

In [None]:
df4 = df.groupby('CoffeeSolveBugs').agg({'CodingHours':'mean','CoffeeCupsPerDay' :'mean'})
df4

In [None]:
# df4 is grouped by CoffeeSolveBugs, so the title names that column
# (the previous title was copied from the CoffeeTime chart).
px.bar(data_frame=df4, barmode='group',
       title = "<b>Coffee Solve Bugs wise Analyzing</b>",template="plotly_dark")

#### Most of the time, coffee solves bugs

In [None]:
df5 = df.groupby('Gender').agg({'CodingHours':'mean','CoffeeCupsPerDay' :'mean'})
df5

In [None]:
# df5 is grouped by Gender, so the title names that column
# (the previous title was copied from the CoffeeTime chart).
px.bar(data_frame=df5, barmode='group',
       title = "<b>Gender wise Analyzing</b>",template="plotly_dark")

#### Males code more and drink more cups of coffee per day

In [None]:
df6 = df.groupby('AgeRange').agg({'CodingHours':'mean','CoffeeCupsPerDay' :'mean'})
df6

In [None]:
# df6 is grouped by AgeRange, so the title names that column
# (the previous title said "Coffee Type").
px.bar(data_frame=df6, barmode='group',
       title = "<b>Age Range wise Analyzing</b>",template="plotly_dark")

#### People under 18 code the most hours, and people aged 40-49 drink the most cups of coffee per day

In [None]:
# CoffeeCupsPerDay per CoffeeType, with bars coloured by Gender.
px.bar(
    data_frame=df,
    x='CoffeeType',
    y='CoffeeCupsPerDay',
    color='Gender',
    template="plotly_dark",
    title="<b>Coffee Type wise Analyzing</b>",
)

#### Americano is drunk only by females
#### Double Espresso and Cappuccino are drunk only by males

In [None]:
# CoffeeCupsPerDay per CoffeeType, with bars coloured by AgeRange.
px.bar(
    data_frame=df,
    x='CoffeeType',
    y='CoffeeCupsPerDay',
    color='AgeRange',
    template="plotly_dark",
    title="<b>Coffee Type wise Analyzing</b>",
)

#### Americano is drunk only by people aged 30-39
#### Caffè latte is drunk only by people aged 18-29
#### People aged 18-29 drink all types of coffee except Americano
#### People under 18 drink only Cappuccino
#### Nescafe coffee is drunk only by people aged 18-39

In [None]:
# CoffeeCupsPerDay per CoffeeType, with bars coloured by CodingWithoutCoffee.
px.bar(
    data_frame=df,
    x='CoffeeType',
    y='CoffeeCupsPerDay',
    color='CodingWithoutCoffee',
    template="plotly_dark",
    title="<b>Coffee Type wise Analyzing</b>",
)

#### Double Espresso is the only coffee drunk by people while coding

# (IMPORTANT) Advanced Visualisation

## Getting unique values of each category

In [None]:
from IPython.core.display import HTML

def multi_table(table_list):
    """Place several tables side by side in a single HTML table row.

    Each element of ``table_list`` must expose ``_repr_html_()``; the markup it
    returns is wrapped in its own ``<td>`` cell.

    Returns an ``HTML`` object containing the one-row table.
    """
    cells = []
    for table in table_list:
        cells.append('<td>' + table._repr_html_() + '</td>')
    markup = '<table><tr style="background-color:white;">' + ''.join(cells) + '</tr></table>'
    return HTML(markup)

In [None]:
# Categorical columns to summarise, in the order the tables are displayed.
# A list (not a set) keeps the order deterministic, and every column is shown
# exactly once: previously 'Gender' appeared twice and 'Country' was omitted.
categorical_cols = ['CoffeeTime', 'CodingWithoutCoffee', 'CoffeeType',
                    'CoffeeSolveBugs', 'Gender', 'Country', 'AgeRange']
# One value-count table per categorical column.
df_nunique = {var: pd.DataFrame(df[var].value_counts())
              for var in categorical_cols}
multi_table([df_nunique[var] for var in categorical_cols])

In [None]:
# Categorical columns to summarise, in the order the tables are displayed.
# A list (not a set) keeps the order deterministic, and every column is shown
# exactly once: previously 'Gender' appeared twice and 'Country' was omitted.
categorical_cols = ['CoffeeTime', 'CodingWithoutCoffee', 'CoffeeType',
                    'CoffeeSolveBugs', 'Gender', 'Country', 'AgeRange']
# For each categorical column: number of rows per (category, CodingHours) pair.
df_groupby = {var: pd.DataFrame(df.groupby([var, 'CodingHours']).size())
              for var in categorical_cols}
multi_table([df_groupby[var] for var in categorical_cols])

In [None]:
# Categorical columns to summarise, in the order the tables are displayed.
# A list (not a set) keeps the order deterministic, and every column is shown
# exactly once: previously 'Gender' appeared twice and 'Country' was omitted.
categorical_cols = ['CoffeeTime', 'CodingWithoutCoffee', 'CoffeeType',
                    'CoffeeSolveBugs', 'Gender', 'Country', 'AgeRange']
# For each categorical column: number of rows per (category, CoffeeCupsPerDay) pair.
df_groupby = {var: pd.DataFrame(df.groupby([var, 'CoffeeCupsPerDay']).size())
              for var in categorical_cols}
multi_table([df_groupby[var] for var in categorical_cols])