In [None]:
!pip install pandas

# PANDAS

__Pandas__ is a powerful _open-source_ data manipulation and analysis library for _Python_. 

It provides data structures and functions for efficiently __handling and analyzing structured data__, such as tables or spreadsheets.

With __pandas__, you can easily _load_, _manipulate_, _analyze data_, perform _data cleaning_ and _preprocessing_ tasks, and create _visualizations_.

It is widely used in _data science_, _machine learning_, and _data analysis_ projects.

To import the pandas library and assigns it the alias 'pd', you could make `import pandas as pd`.

## The Series Data Structure

A __pandas Series__ is a _one-dimensional labeled array_ capable of holding any data type. It is similar to a _column_ in a spreadsheet or a SQL table, or a _dictionary-like_ object. It is a fundamental _data structure_ in __pandas__ library, which is widely used for data manipulation and analysis in Python.

A __pandas Series__ consists of two main components: the _data_ and the _index_. The _data_ can be of any type, such as integers, floats, strings, or even complex objects. The _index_ is a sequence of labels that uniquely identifies each element in the Series.

Some key features of pandas Series include:
- Vectorized operations: Series supports vectorized operations, allowing you to perform element-wise computations efficiently.
- Label-based indexing: You can access elements in a Series using labels instead of integer-based indexing.
- Alignment: Series automatically aligns data based on the index, making it easy to perform operations on multiple Series with different indexes.

To create a __Series__, you can pass a list, array, or dictionary-like object to the `pd.Series()` constructor. You can also specify custom index labels if needed.

In [None]:
import pandas as pd

# Create a Series object from a list of strings
list_strings = ['a', 'b', 'c', 'd', 'e']
serie_1 = pd.Series(list_strings)
print("Serie 1:\n", serie_1)

# Create a Series object from a list of numbers
list_numbers = [1, 2, 3, 4, 5]
serie_2 = pd.Series(list_numbers)
print("\nSerie 2:\n", serie_2)

# Create a Series object from a list of numbers with a None value
list_numbers_with_none = [1, 2, None, 4, 5]
serie_3 = pd.Series(list_numbers_with_none)
print("\nSerie 3:\n", serie_3)

In [5]:
# Create a Series object from a dictionary
dict_data = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
serie_4 = pd.Series(dict_data)
print("Serie 4:\n", serie_4)

# Get the values of the Series index
print("Serie 4 index:", serie_4.index)

Serie 4:
 a    1
b    2
c    3
d    4
e    5
dtype: int64
Serie 4 index: Index(['a', 'b', 'c', 'd', 'e'], dtype='object')


In [6]:
# Create a series object from a list of tuple pairs
list_tuples = [('a', 1), ('b', 2), ('c', 3), ('d', 4), ('e', 5)]
serie_5 = pd.Series(list_tuples)
print("Serie 5:", serie_5)

Serie 5: 0    (a, 1)
1    (b, 2)
2    (c, 3)
3    (d, 4)
4    (e, 5)
dtype: object


In [7]:
# Create a series object from a list as values and a list as index
list_index = ['a', 'b', 'c', 'd', 'e']
list_values = [1, 2, 3, 4, 5]
serie_6 = pd.Series(list_values, index=list_index)
print("Serie 6:\n", serie_6)

for index, value in serie_6.items():
    print(f"Index: {index}, Value: {value}")

Serie 6:
 a    1
b    2
c    3
d    4
e    5
dtype: int64
Index: a, Value: 1
Index: b, Value: 2
Index: c, Value: 3
Index: d, Value: 4
Index: e, Value: 5


In [9]:
# Query a Series object by boolean indexing
print("Serie 6 > 2:\n",serie_6[serie_6 > 2])
print("-"*10)
print(serie_6 > 2)

Serie 6 > 2:
 c    3
d    4
e    5
dtype: int64
----------
a    False
b    False
c     True
d     True
e     True
dtype: bool


In [11]:
# Query a Series object by fancy indexing
print("Serie 6[['a', 'b']]:\n", serie_6[['a', 'b']])

Serie 6[['a', 'b']]:
 a    1
b    2
dtype: int64


In [14]:
# Query a Series object using loc[]
print("Serie 6.loc[['a', 'b']]:\n", serie_6.loc[['a', 'b']])

Serie 6.loc[['a', 'b']]:
 a    1
b    2
dtype: int64


In [15]:
# Query a Series object using iloc[]
print("Serie 6.iloc[0:3]:\n", serie_6.iloc[0:3])

Serie 6.iloc[0:3]:
 a    1
b    2
c    3
dtype: int64


## The DataFrame Data Structure

A __pandas DataFrame__ is a _two-dimensional_, _labeled_ data structure in _Python_ that is commonly used for _data manipulation and analysis_. It consists of _rows_ and _columns_, similar to a table in a relational database.

The __DataFrame__ can store _heterogeneous data types_ and provides various operations and functions to perform data manipulation, filtering, grouping, and statistical analysis.

To access and manipulate the data in the __DataFrame__, you can use various _methods_ and _attributes_ provided by the __pandas__ library.

For more information on __pandas DataFrame__, refer to the [official pandas documentation](https://pandas.pydata.org/docs/reference/frame.html).

In [17]:
# create dataframes from lists
list_example = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
df = pd.DataFrame(list_example)
print("Dataframe from list:\n", df)
print("Dataframe from list shape:", df.shape)
print("Dataframe from list columns:", df.columns)
print("Dataframe from list index:", df.index)

Dataframe from list:
    0  1  2
0  1  2  3
1  4  5  6
2  7  8  9
Dataframe from list shape: (3, 3)
Dataframe from list columns: RangeIndex(start=0, stop=3, step=1)
Dataframe from list index: RangeIndex(start=0, stop=3, step=1)


In [18]:
df

Unnamed: 0,0,1,2
0,1,2,3
1,4,5,6
2,7,8,9


In [19]:
# create a dataframe from a list of dictionaries
list_dict = [{'a': 1, 'b': 2, 'c': 3}, {'a': 4, 'b': 5, 'c': 6}, {'a': 7, 'b': 8, 'c': 9}]
df = pd.DataFrame(list_dict)
print("Dataframe from list of dictionaries:\n", df)
print("Dataframe from list columns:", df.columns)
print("Dataframe from list index:", df.index)

Dataframe from list of dictionaries:
    a  b  c
0  1  2  3
1  4  5  6
2  7  8  9
Dataframe from list columns: Index(['a', 'b', 'c'], dtype='object')
Dataframe from list index: RangeIndex(start=0, stop=3, step=1)


In [15]:
# create a dataframe from a csv file
df_csv = pd.read_csv('StudentsInfo.csv')
df_csv.head()


Unnamed: 0,Name,Company,Position,Salary
0,Alice Johnson,"Hernandez, Griffith and Nelson",Petroleum engineer,4740
1,David Jones,Gomez-Garcia,"Geologist, engineering",73329
2,Eva Brown,Blevins LLC,Microbiologist,83245
3,Frank Davis,Greene-Wilson,Museum education officer,74390
4,Jack Anderson,Butler PLC,"Scientist, research (maths)",69851


In [2]:

import pandas as pd
# create a dataframe from a json file
df_json = pd.read_json('StudentsInfo.json')
print(df_json.head())

   id            name                  career                college
0   1   Alice Johnson        Computer Science        Tech University
1   2       Bob Smith  Mechanical Engineering  Engineering Institute
2   3  Carol Williams  Electrical Engineering        Tech University
3   4     David Jones                 Biology        Science College
4   5       Eva Brown                 Physics        Tech University


In [4]:
# describe a dataframe
print("Dataframe csv describe:\n", df_csv.describe())
print("*"*10)
print("Dataframe json describe:\n", df_json.describe())

Dataframe csv describe:
        #Passengers
count   144.000000
mean    280.298611
std     119.966317
min     104.000000
25%     180.000000
50%     265.500000
75%     360.500000
max     622.000000
**********
Dataframe json describe:
              id
count  50.00000
mean   25.50000
std    14.57738
min     1.00000
25%    13.25000
50%    25.50000
75%    37.75000
max    50.00000


In [5]:
# get information about a dataframe
print("Dataframe csv info:\n", df_csv.info())
print("="*50)
print("Dataframe json info:\n", df_json.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Month        144 non-null    object
 1   #Passengers  144 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 2.4+ KB
Dataframe csv info:
 None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       50 non-null     int64 
 1   name     50 non-null     object
 2   career   50 non-null     object
 3   college  50 non-null     object
dtypes: int64(1), object(3)
memory usage: 1.7+ KB
Dataframe json info:
 None


In [None]:
# indexes and columns
df_changed_index = df_csv.set_index('Name')
print("Dataframe csv changed index:\n", df_changed_index.head())
print("*"*50)
print("Dataframe csv changed index columns:\n", df_changed_index.columns)
print("Dataframe csv changed index index:\n", df_changed_index.index)

In [None]:
# rename columns
df_renamed = df_csv.rename(columns={'Salary': 'Salary (USD/year)'})
df_renamed.head()

## Using Datetime into Pandas

In [6]:
# converting a column to datetime with to_datetime()
sales_df = pd.read_json('sales_data.json')
print("Sales dataframe:\n", sales_df.info())
print(sales_df[sales_df['code'] == "Sale-4594-TuGI"])
sales_df = sales_df.replace('2008-Dic-23', '2008-12-23')
print("*"*50)
sales_df['date'] = pd.to_datetime(sales_df['date'])
print("Sales dataframe with Date column as datetime:\n", sales_df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   code         1000 non-null   object 
 1   client       1000 non-null   object 
 2   total_price  1000 non-null   float64
 3   date         1000 non-null   object 
 4   hour         1000 non-null   object 
 5   credit_card  1000 non-null   int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 47.0+ KB
Sales dataframe:
 None
              code         client  total_price         date      hour  \
24  Sale-4594-TuGI  Patrick Meyer       150.19  2008-Dic-23  07:32:54   

     credit_card  
24  569507527095  
**************************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   code         1000 non-null   

In [7]:
# converting a column from datetime to string with strftime()
sales_df['date_full'] = sales_df['date'].dt.strftime('%Y-%b-%d') + " " + sales_df['hour']
print(sales_df.head())

             code           client  total_price       date      hour  \
0  Sale-1117-HdZH        Gary Meza       832.48 2024-06-03  08:13:18   
1  Sale-5078-hqkc     Carol Martin       156.29 2014-08-22  02:57:20   
2  Sale-8209-xGVn   Jeremy Spencer       832.05 1979-05-22  02:10:06   
3  Sale-9093-bfcp  Pamela Anderson       166.57 1973-12-24  21:28:34   
4  Sale-8141-KGOb    Kenneth Marsh       498.43 1974-03-14  03:55:36   

        credit_card             date_full  
0  6011811575065598  2024-Jun-03 08:13:18  
1  6536303182814044  2014-Aug-22 02:57:20  
2   213185615148626  1979-May-22 02:10:06  
3  3558512811558836  1973-Dec-24 21:28:34  
4  2239583806605394  1974-Mar-14 03:55:36  


In [8]:
# converting a column from datetime to a timestamp with timestamp()
sales_df['timestamp'] = pd.to_datetime(sales_df['date_full']).apply(lambda x: x.timestamp())
print(sales_df.head())
sales_df['timestamp'] = sales_df['timestamp'].astype('int64')
print("="*50)
print(sales_df.head())


             code           client  total_price       date      hour  \
0  Sale-1117-HdZH        Gary Meza       832.48 2024-06-03  08:13:18   
1  Sale-5078-hqkc     Carol Martin       156.29 2014-08-22  02:57:20   
2  Sale-8209-xGVn   Jeremy Spencer       832.05 1979-05-22  02:10:06   
3  Sale-9093-bfcp  Pamela Anderson       166.57 1973-12-24  21:28:34   
4  Sale-8141-KGOb    Kenneth Marsh       498.43 1974-03-14  03:55:36   

        credit_card             date_full     timestamp  
0  6011811575065598  2024-Jun-03 08:13:18  1.717402e+09  
1  6536303182814044  2014-Aug-22 02:57:20  1.408676e+09  
2   213185615148626  1979-May-22 02:10:06  2.961870e+08  
3  3558512811558836  1973-Dec-24 21:28:34  1.256165e+08  
4  2239583806605394  1974-Mar-14 03:55:36  1.324653e+08  
             code           client  total_price       date      hour  \
0  Sale-1117-HdZH        Gary Meza       832.48 2024-06-03  08:13:18   
1  Sale-5078-hqkc     Carol Martin       156.29 2014-08-22  02:57:20   
2  

In [None]:
print(sales_df.describe())

In [9]:
%%timeit -n 100
import numpy as np

print(np.round(np.sum(sales_df['total_price']), 3))

489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
4

In [10]:
%%timeit -n 100
import numpy as np
total = 0
for price in sales_df['total_price']:
    total += price
print(np.round(total, 3))

489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
489713.8
4

## Queries and Transformations

In [11]:
# drop a column
sales_copy_df = sales_df.copy()
print(sales_copy_df.columns)
sales_copy_df = sales_copy_df.drop(columns=['timestamp'], axis=1)
sales_copy_df.head()

Index(['code', 'client', 'total_price', 'date', 'hour', 'credit_card',
       'date_full', 'timestamp'],
      dtype='object')


Unnamed: 0,code,client,total_price,date,hour,credit_card,date_full
0,Sale-1117-HdZH,Gary Meza,832.48,2024-06-03,08:13:18,6011811575065598,2024-Jun-03 08:13:18
1,Sale-5078-hqkc,Carol Martin,156.29,2014-08-22,02:57:20,6536303182814044,2014-Aug-22 02:57:20
2,Sale-8209-xGVn,Jeremy Spencer,832.05,1979-05-22,02:10:06,213185615148626,1979-May-22 02:10:06
3,Sale-9093-bfcp,Pamela Anderson,166.57,1973-12-24,21:28:34,3558512811558836,1973-Dec-24 21:28:34
4,Sale-8141-KGOb,Kenneth Marsh,498.43,1974-03-14,03:55:36,2239583806605394,1974-Mar-14 03:55:36


In [12]:

# drop a row
sales_df = sales_df.set_index('code')
sales_copy_df = sales_df.copy()
sales_copy_df['country'] = 'Colombia'
print(sales_copy_df.head())
sales_copy_df = sales_copy_df.drop(index='Sale-8141-KGOb')
sales_copy_df.head()

                         client  total_price       date      hour  \
code                                                                
Sale-1117-HdZH        Gary Meza       832.48 2024-06-03  08:13:18   
Sale-5078-hqkc     Carol Martin       156.29 2014-08-22  02:57:20   
Sale-8209-xGVn   Jeremy Spencer       832.05 1979-05-22  02:10:06   
Sale-9093-bfcp  Pamela Anderson       166.57 1973-12-24  21:28:34   
Sale-8141-KGOb    Kenneth Marsh       498.43 1974-03-14  03:55:36   

                     credit_card             date_full   timestamp   country  
code                                                                          
Sale-1117-HdZH  6011811575065598  2024-Jun-03 08:13:18  1717402398  Colombia  
Sale-5078-hqkc  6536303182814044  2014-Aug-22 02:57:20  1408676240  Colombia  
Sale-8209-xGVn   213185615148626  1979-May-22 02:10:06   296187006  Colombia  
Sale-9093-bfcp  3558512811558836  1973-Dec-24 21:28:34   125616514  Colombia  
Sale-8141-KGOb  2239583806605394  1974-Mar

Unnamed: 0_level_0,client,total_price,date,hour,credit_card,date_full,timestamp,country
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Sale-1117-HdZH,Gary Meza,832.48,2024-06-03,08:13:18,6011811575065598,2024-Jun-03 08:13:18,1717402398,Colombia
Sale-5078-hqkc,Carol Martin,156.29,2014-08-22,02:57:20,6536303182814044,2014-Aug-22 02:57:20,1408676240,Colombia
Sale-8209-xGVn,Jeremy Spencer,832.05,1979-05-22,02:10:06,213185615148626,1979-May-22 02:10:06,296187006,Colombia
Sale-9093-bfcp,Pamela Anderson,166.57,1973-12-24,21:28:34,3558512811558836,1973-Dec-24 21:28:34,125616514,Colombia
Sale-7567-cCLb,William Smith,857.13,2016-07-04,18:26:22,3541411469135312,2016-Jul-04 18:26:22,1467656782,Colombia


In [None]:
# query a dataframe by column
clients = sales_df['client']
print(type(clients))
print(clients.head())

In [13]:
# query a dataframe by row with loc
sales_df.loc['Sale-8209-xGVn', ['client', 'date']]


client         Jeremy Spencer
date      1979-05-22 00:00:00
Name: Sale-8209-xGVn, dtype: object

In [None]:
# query a dataframe by row with iloc
sales_df.iloc[2:5]

In [None]:
# query a dataframe using a boolean mask
sales_df[sales_df['total_price'] > 600].head()

In [None]:
sales_df[(sales_df['total_price'] > 600) & (sales_df['date'] > '2016-01-01')].head()

In [None]:
# change all column names to Capital Case
sales_df.columns = sales_df.columns.str.capitalize()
sales_df.head()

In [None]:
print(sales_df.head())
sales_df = sales_df.reset_index()
sales_df.head()

In [None]:
# get availables values in a column
print(sales_df['Date'].size)
print(sales_df['Date'].unique())

In [None]:
# query a dataframe using query()
sales_df.query('Total_price > 600').head()

In [None]:
sales_df.query('Total_price > 600 and Date > "2016-01-01"').head()

In [None]:
# get missing values using isnull()
call_center_df = pd.read_json('json-files/call_center_comments.json')
print(call_center_df.info())
print("*"*50)
print(call_center_df.isnull().sum())

In [None]:
# fill missing values using fillna()
# fill missing attention_time with average time
import numpy as np
call_center_filled_df = call_center_df.copy()
call_center_filled_df['attention_time'] = call_center_filled_df['attention_time'].fillna(np.mean(call_center_df['attention_time']))
print(call_center_filled_df.isnull().sum())

# fill missing city with not reported
call_center_filled_df['city'] = call_center_filled_df['city'].fillna('Not reported')
print(call_center_filled_df.isnull().sum())

# fill missing values using fillna() in date_time column with interopolation
call_center_filled_df['date_time'] = call_center_filled_df['date_time'].interpolate()
print(call_center_filled_df.isnull().sum())
print(call_center_filled_df.info())
call_center_filled_df.head()


In [None]:
# drop missing values using dropna()
cleaned_df = call_center_df.dropna()
print(cleaned_df.info())

In [None]:
# transform column using to_datetime()
call_center_df['dt_str'] = call_center_df['date_time'].astype(str)
print(call_center_df.info())
call_center_df['dt'] = pd.to_datetime(call_center_df['dt_str'])
print(call_center_df.info())

In [None]:
# transform column using to_numeric()
call_center_df['at_str'] = call_center_df['attention_time'].astype(str)
print(call_center_df.info())
call_center_df['at'] = pd.to_numeric(call_center_df['at_str'], errors='coerce')
print(call_center_df.info())

In [None]:
# convert column to category using astype()
call_center_df = call_center_df.drop(columns=['dt_str', 'at_str'])
del call_center_df['dt']
del call_center_df['at']
call_center_df = call_center_df.rename(columns={'country_of_origin': 'country'})
call_center_df['country'] = call_center_df['country'].astype('category')
call_center_df['city'] = call_center_df['city'].astype('category')
print(call_center_df.info())

### Students Info for Merge and Joining

In [16]:
df_csv.columns = df_csv.columns.str.lower()
df_csv.head()

Unnamed: 0,name,company,position,salary
0,Alice Johnson,"Hernandez, Griffith and Nelson",Petroleum engineer,4740
1,David Jones,Gomez-Garcia,"Geologist, engineering",73329
2,Eva Brown,Blevins LLC,Microbiologist,83245
3,Frank Davis,Greene-Wilson,Museum education officer,74390
4,Jack Anderson,Butler PLC,"Scientist, research (maths)",69851


In [17]:
df_json.head()

Unnamed: 0,id,name,career,college
0,1,Alice Johnson,Computer Science,Tech University
1,2,Bob Smith,Mechanical Engineering,Engineering Institute
2,3,Carol Williams,Electrical Engineering,Tech University
3,4,David Jones,Biology,Science College
4,5,Eva Brown,Physics,Tech University


In [19]:
# merge dataframes using merge()
students_merge_df = pd.merge(df_csv, df_json, on='name')
students_merge_df.to_csv('students_merge.csv', index=False)
students_merge_df.head()

Unnamed: 0,name,company,position,salary,id,career,college
0,Alice Johnson,"Hernandez, Griffith and Nelson",Petroleum engineer,4740,1,Computer Science,Tech University
1,David Jones,Gomez-Garcia,"Geologist, engineering",73329,4,Biology,Science College
2,Eva Brown,Blevins LLC,Microbiologist,83245,5,Physics,Tech University
3,Frank Davis,Greene-Wilson,Museum education officer,74390,6,Chemistry,Science College
4,Jack Anderson,Butler PLC,"Scientist, research (maths)",69851,10,Software Engineering,Tech University


In [20]:
# concatenate dataframes using concat()
students_concat_df = pd.concat([df_csv, df_json], axis=0)
students_concat_df.to_csv('students_concat.csv', index=False)
students_concat_df.head()

Unnamed: 0,name,company,position,salary,id,career,college
0,Alice Johnson,"Hernandez, Griffith and Nelson",Petroleum engineer,4740.0,,,
1,David Jones,Gomez-Garcia,"Geologist, engineering",73329.0,,,
2,Eva Brown,Blevins LLC,Microbiologist,83245.0,,,
3,Frank Davis,Greene-Wilson,Museum education officer,74390.0,,,
4,Jack Anderson,Butler PLC,"Scientist, research (maths)",69851.0,,,


In [None]:
temp_concat_df = pd.concat([df_csv, df_csv.copy()], axis=0).reset_index(drop=True)
print(temp_concat_df.info())
print("*"*50)
temp_concat_df = temp_concat_df.drop_duplicates()
print(temp_concat_df.info())

In [None]:
df_csv = df_csv.set_index('name')
df_json = df_json.set_index('name')

In [21]:
# join dataframes using join()
students_join_df = df_csv.join(df_json, how='inner')
students_join_df.to_csv('students_join.csv')
students_join_df.head()

ValueError: columns overlap but no suffix specified: Index(['name'], dtype='object')

### Coming Back to the CallCenter

In [None]:
# group dataframes using groupby()
# groupby by country
for country, country_df in call_center_df.groupby('country'):
    print(country)
    print(country_df.head())

In [None]:
# groupby by country and city
for (country, city), country_city_df in call_center_df.groupby(['country', 'city']):
    print(country, city)
    print(country_city_df.head())

In [None]:
# group and aggregate dataframes using groupby() and aggregate()
# grup by country and get the mean, std, min, max  of attention_time
grouped_country_df = call_center_df.groupby('country').agg({'attention_time': ['mean', 'std', np.min, np.nanmax]})
grouped_country_df

In [None]:
# group by country and city and get the mean, std, min, max of attention_time
grouped_country_city_df = call_center_df.groupby(['country', 'city']).agg({'attention_time': ['mean', 'std', np.min, np.nanmax]})
grouped_country_city_df = grouped_country_city_df.dropna()
grouped_country_city_df.to_csv('csv-files/grouped_country_city.csv')
grouped_country_city_df

In [None]:
grouped_country_city_df = grouped_country_city_df.reset_index()
grouped_country_city_df.head()

In [None]:
# group and transform dataframes using groupby() and transform()
call_center_df['attention_time_mean'] = call_center_df.groupby('country')['attention_time'].transform(np.mean)
call_center_df['attention_time_std'] = call_center_df.groupby('country')['attention_time'].transform('std')
call_center_df = call_center_df.sort_values(by='country')
print(call_center_df.shape)
call_center_df.head()

In [None]:
print(call_center_df['attention_time_mean'].unique())

In [None]:
# group and filter dataframes using groupby() and filter()
call_center_df_filtered = call_center_df.groupby('country').filter(lambda x: x['attention_time'].mean() > 77.6)
print(call_center_df_filtered.shape)
print(call_center_df_filtered['attention_time_mean'].unique())
call_center_df_filtered.head()

In [None]:
# pivot dataframes using pivot_table()
def create_category(x):
    if x > 120:
        return 'Too Bad'
    elif x > 60:
        return 'Bad'
    elif x > 20:
        return 'Medium'
    else:
        return 'Acceptable'

call_center_df['attention_category'] = call_center_df['attention_time'].apply(lambda x: create_category(x))
call_center_df.to_csv('csv-files/call_center_data.csv', index=False)
call_center_df.pivot_table(values='attention_time', index='country', columns='attention_category', aggfunc=['mean', 'min', 'max'], observed=False).head()

## Advanced Transformations

In [None]:
# making transformations using apply()
def get_region(country):
    if country in ['Brazil']:
        return 'South America'
    elif country in ['USA', 'Canada', 'Mexico']:
        return 'North America'
    elif country in ['Spain', 'France', 'UK', 'Germany', 'Italy']:
        return 'Europe'
    elif country in ['China', 'South Korea', 'India', 'Japan']:
        return 'Asia'
    elif country in ['Russia']:
        return 'Asia/Europe'
    elif country in ['Australia']:
        return 'Oceania'
    else:
        return 'NA'

call_center_df['region'] = call_center_df['country'].apply(lambda x: get_region(x))
call_center_df.head()

In [None]:
# making transformations using chain transformations
# Assuming call_center_df is your DataFrame and get_region is a predefined function

transformed_df = (
    call_center_df
    .assign(
        attention_time=lambda x: x['attention_time'].fillna(x['attention_time'].mean()),
        city=lambda x: x['city'].astype(str).fillna('non-registered'),
        client=lambda x: x['client'].str.capitalize(),
        outlier=lambda x: (np.abs(x['attention_time_mean'] - x['attention_time']) >\
                           np.abs(x['attention_time_mean'] - x['attention_time_std'])).astype(int)
    )
    .dropna()
    .assign(region=lambda x: x['country'].apply(get_region).str.upper())
    .set_index('code')
    .round({'attention_time': 3})
    .rename(columns={'client': 'client_name'})
)

transformed_df.head()

## Statistical Testing

In [None]:
# making a t-test with pandas and scipy
# t-test is a statistical test that is used to compare the means of two groups
from scipy.stats import ttest_ind

group1 = call_center_df['group1_scores']
group2 = call_center_df['group2_scores']

t_stat, p_value = ttest_ind(group1, group2)
print(f"T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# making an ANOVA test with pandas and scipy
# ANOVA test is a statistical test that is used to compare the means of three or more groups
from scipy.stats import f_oneway

group1 = call_center_df[call_center_df['group_column'] == 'Group1']['scores_column']
group2 = call_center_df[call_center_df['group_column'] == 'Group2']['scores_column']
group3 = call_center_df[call_center_df['group_column'] == 'Group3']['scores_column']

f_stat, p_value = f_oneway(group1, group2, group3)  
print(f"F-statistic: {f_stat}, P-value: {p_value}")

In [None]:
# making a chi-square test with pandas and scipy
# chi-square test is a statistical test that is used to compare the frequency of two or more groups
from scipy.stats import chi2_contingency

call_center_df['column1'] = call_center_df['column1'].astype('category')
call_center_df['column2'] = call_center_df['column2'].astype('category')

contingency_table = pd.crosstab(call_center_df['column1'], call_center_df['column2'])
chi2, p, dof, expected = chi2_contingency(contingency_table)


In [None]:
# making a correlation validation with pandas 
# correlation is a statistical test that is used to measure the relationship between two variables
call_center_df.corr()

In [None]:
# p-hacking example


In [None]:
# p-hacking example with multiple testing


In [None]:
# p-value example


In [None]:
# p-value correction with Bonferroni
