In [47]:
import pandas as pd


## load data file (CSV, Excel, JSON, etc.)
       - pd.read_csv()
       - pd.read_excel()
       - pd.read_json()

In [48]:
# Read the sample sales CSV into a DataFrame.
# latin1 is passed explicitly; presumably the file is not valid UTF-8 (TODO confirm).
df = pd.read_csv(
    "Sample_Data/sales_data_sample.csv",
    encoding="latin1",
)

# Excel variant, left disabled:
# df1=pd.read_excel("SampleSuperstore.xlsx")

# JSON variant, left disabled:
# df=pd.read_json("Sample_Data/sample_Data.json")


## create Series and DataFrame
    - pd.Series()
        -  a one-dimensional array, like a single column in a spreadsheet.
    - pd.DataFrame()
        - a two-dimensional, table-like structure with rows and columns, like an entire spreadsheet.

In [49]:
# Raw inputs: a dict of equal-length lists (one list per column) and a flat list.
data = {
    "name": ["Alice", "Bob", "Charlie"],
    "city": ["New York", "Los Angeles", "Chicago"],
}
column = [10, 20, 30]

# Each dict key becomes a column of the two-dimensional DataFrame;
# the flat list becomes a one-dimensional Series with a default integer index.
dataFrame = pd.DataFrame(data)
series = pd.Series(column)

# Show both objects, the DataFrame first and the Series after a blank line.
print(f"dataFrame:\n{dataFrame}")
print(f"\nseries:\n{series}")

dataFrame:
      name         city
0    Alice     New York
1      Bob  Los Angeles
2  Charlie      Chicago

series:
0    10
1    20
2    30
dtype: int64


## save data file (CSV, Excel, JSON, etc.)
       - df.to_csv()
       - df.to_excel()
       - df.to_json()

In [50]:
# CSV export; index=True writes the row labels as an extra leading column.
dataFrame.to_csv(path_or_buf="data.csv", index=True)

# Excel export, left disabled:
# dataFrame.to_excel("data.xlsx")

# JSON export with the default settings.
dataFrame.to_json(path_or_buf="data.json")

# DataFrame attributes and methods
       - df.shape
       - df.columns
       - df.index
       - df.dtypes
       - df.info()
       - df.describe()
       - df.head()
       - df.tail()

In [51]:
# shape is an attribute (no call parentheses) holding a (rows, columns) tuple.
print(
    "shape = ",
    df.shape,
)

shape =  (2823, 25)


In [52]:
# columns is an attribute holding the column labels (headers) as an Index object.
print(
    "column name = \n",
    df.columns,
)

column name = 
 Index(['ORDERNUMBER', 'QUANTITYORDERED', 'PRICEEACH', 'ORDERLINENUMBER',
       'SALES', 'ORDERDATE', 'STATUS', 'QTR_ID', 'MONTH_ID', 'YEAR_ID',
       'PRODUCTLINE', 'MSRP', 'PRODUCTCODE', 'CUSTOMERNAME', 'PHONE',
       'ADDRESSLINE1', 'ADDRESSLINE2', 'CITY', 'STATE', 'POSTALCODE',
       'COUNTRY', 'TERRITORY', 'CONTACTLASTNAME', 'CONTACTFIRSTNAME',
       'DEALSIZE'],
      dtype='object')


In [53]:
# index is an attribute holding the row labels of the DataFrame.
print(
    "index = ",
    df.index,
)


index =  RangeIndex(start=0, stop=2823, step=1)


In [54]:
# dtypes is an attribute giving one data type per column (e.g., int64, float64, object).
print(
    "datatype of columns = \n",
    df.dtypes,
)

datatype of columns = 
 ORDERNUMBER           int64
QUANTITYORDERED       int64
PRICEEACH           float64
ORDERLINENUMBER       int64
SALES               float64
ORDERDATE            object
STATUS               object
QTR_ID                int64
MONTH_ID              int64
YEAR_ID               int64
PRODUCTLINE          object
MSRP                  int64
PRODUCTCODE          object
CUSTOMERNAME         object
PHONE                object
ADDRESSLINE1         object
ADDRESSLINE2         object
CITY                 object
STATE                object
POSTALCODE           object
COUNTRY              object
TERRITORY            object
CONTACTLASTNAME      object
CONTACTFIRSTNAME     object
DEALSIZE             object
dtype: object


In [55]:
# df.info() :- A method that prints a concise summary of the DataFrame, including the index and column types, non-null values, and memory usage.
# It writes the summary to stdout itself and returns None, so it must not be wrapped in print():
# doing so emits the summary first and then the label followed by "None".
print("info of dataset = ")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2823 entries, 0 to 2822
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ORDERNUMBER       2823 non-null   int64  
 1   QUANTITYORDERED   2823 non-null   int64  
 2   PRICEEACH         2823 non-null   float64
 3   ORDERLINENUMBER   2823 non-null   int64  
 4   SALES             2823 non-null   float64
 5   ORDERDATE         2823 non-null   object 
 6   STATUS            2823 non-null   object 
 7   QTR_ID            2823 non-null   int64  
 8   MONTH_ID          2823 non-null   int64  
 9   YEAR_ID           2823 non-null   int64  
 10  PRODUCTLINE       2823 non-null   object 
 11  MSRP              2823 non-null   int64  
 12  PRODUCTCODE       2823 non-null   object 
 13  CUSTOMERNAME      2823 non-null   object 
 14  PHONE             2823 non-null   object 
 15  ADDRESSLINE1      2823 non-null   object 
 16  ADDRESSLINE2      302 non-null    object 


In [56]:
# describe() is a method returning summary statistics for the numerical columns:
# count, mean, standard deviation, min, max, and percentiles.
print(
    "description of dataset = \n",
    df.describe(),
)

description of dataset = 
         ORDERNUMBER  QUANTITYORDERED    PRICEEACH  ORDERLINENUMBER  \
count   2823.000000      2823.000000  2823.000000      2823.000000   
mean   10258.725115        35.092809    83.658544         6.466171   
std       92.085478         9.741443    20.174277         4.225841   
min    10100.000000         6.000000    26.880000         1.000000   
25%    10180.000000        27.000000    68.860000         3.000000   
50%    10262.000000        35.000000    95.700000         6.000000   
75%    10333.500000        43.000000   100.000000         9.000000   
max    10425.000000        97.000000   100.000000        18.000000   

              SALES       QTR_ID     MONTH_ID     YEAR_ID         MSRP  
count   2823.000000  2823.000000  2823.000000  2823.00000  2823.000000  
mean    3553.889072     2.717676     7.092455  2003.81509   100.715551  
std     1841.865106     1.203878     3.656633     0.69967    40.187912  
min      482.130000     1.000000     1.000000  200

In [57]:
# head(n=5) is a method returning the first n rows (5 when n is omitted) for a quick look at the data.
print(
    "first 5 rows of dataset = \n",
    df.head(),
)

first 5 rows of dataset = 
    ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
0        10107               30      95.70                2  2871.00   
1        10121               34      81.35                5  2765.90   
2        10134               41      94.74                2  3884.34   
3        10145               45      83.26                6  3746.70   
4        10159               49     100.00               14  5205.27   

         ORDERDATE   STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
0   2/24/2003 0:00  Shipped       1         2     2003  ...   
1    5/7/2003 0:00  Shipped       2         5     2003  ...   
2    7/1/2003 0:00  Shipped       3         7     2003  ...   
3   8/25/2003 0:00  Shipped       3         8     2003  ...   
4  10/10/2003 0:00  Shipped       4        10     2003  ...   

                    ADDRESSLINE1  ADDRESSLINE2           CITY STATE  \
0        897 Long Airport Avenue           NaN            NYC    NY   
1             59 r

In [58]:
# tail(n=5) is a method returning the last n rows (5 when n is omitted) to inspect the end of the data.
print(
    "last 5 rows of dataset = \n",
    df.tail(),
)

last 5 rows of dataset = 
       ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
2818        10350               20     100.00               15  2244.40   
2819        10373               29     100.00                1  3978.51   
2820        10386               43     100.00                4  5417.57   
2821        10397               34      62.24                1  2116.16   
2822        10414               47      65.52                9  3079.44   

           ORDERDATE    STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
2818  12/2/2004 0:00   Shipped       4        12     2004  ...   
2819  1/31/2005 0:00   Shipped       1         1     2005  ...   
2820   3/1/2005 0:00  Resolved       1         3     2005  ...   
2821  3/28/2005 0:00   Shipped       1         3     2005  ...   
2822   5/6/2005 0:00   On Hold       2         5     2005  ...   

               ADDRESSLINE1  ADDRESSLINE2      CITY STATE POSTALCODE  COUNTRY  \
2818     C/ Moralzarzal, 86           NaN   

# Series attributes and methods
       - series.shape
       - series.index
       - series.dtype
       - series.values
       - series.head()
       - series.tail()
       - series.describe()

In [59]:
# shape is an attribute holding a one-element tuple: (length,).
print(
    "shape = ",
    series.shape,
)

shape =  (3,)


In [60]:
# index is an attribute holding the labels of the Series.
print(
    f"index = {series.index}"
)

index = RangeIndex(start=0, stop=3, step=1)


In [61]:
# dtype is an attribute giving the single data type shared by the Series elements.
print(
    f"datatype = {series.dtype}"
)

datatype = int64


In [62]:
# values is an attribute exposing the Series data as a NumPy array.
print(
    f"values = \n{series.values}"
)

values = 
[10 20 30]


In [63]:
# head(n=5) is a method returning the first n elements (5 when n is omitted); here n=2.
print(
    f"first 2 rows = \n{series.head(2)}"
)

first 2 rows = 
0    10
1    20
dtype: int64


In [64]:
# tail(n=5) is a method returning the last n elements (5 when n is omitted); here n=2.
print(
    f"last 2 rows = \n{series.tail(2)}"
)

last 2 rows = 
1    20
2    30
dtype: int64


In [66]:
# describe() is a method returning summary statistics for the Series:
# count, mean, standard deviation, and quartiles.
print(
    f"description = \n{series.describe()}"
)

description = 
count     3.0
mean     20.0
std      10.0
min      10.0
25%      15.0
50%      20.0
75%      25.0
max      30.0
dtype: float64


## Selecting data
       - df['column_name']
       - df[['col1','col2']]
       - df.iloc[row_index, col_index]
       - df.loc[row_label, col_label]
       - df[condition]
            - condition example:
            (1) for single condition
                    df[df['column_name'] > value]
            (2) for multiple conditions (and: &, or: |; wrap each condition in parentheses)
                    df[(df['col1'] > value1) & (df['col2'] < value2)]
