Pandas is a data manipulation library used for data analysis and cleaning. It provides two data structures: Series and DataFrame. A Series is a 1D array-like object, while a DataFrame is a 2D, size-mutable, and potentially heterogeneous tabular data structure with labeled axes.

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Series
# A pandas Series is a one-dimensional, array-like container that can hold
# values of any data type. It behaves much like a list, or like a single
# column taken out of a DataFrame.

data = list(range(1, 6))  # [1, 2, 3, 4, 5]
s = pd.Series(data)  # no index supplied, so the rows are labelled 0..4
print("Series \n", s)

Series 
 0    1
1    2
2    3
3    4
4    5
dtype: int64


In [5]:
# Series from a dictionary: the keys become the index labels and the
# values become the data.
data_dict = dict(a=1, b=2, c=3)
s_dict = pd.Series(data_dict)
print(s_dict)

a    1
b    2
c    3
dtype: int64


In [6]:
# Series with an explicit index: each value is paired with the label at the
# same position in the index list.
data = [*range(1, 6)]  # [1, 2, 3, 4, 5]
s = pd.Series(data, index=list("abcde"))
print(s)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [7]:
# DataFrame
# A pandas DataFrame is a two-dimensional, size-mutable, potentially
# heterogeneous table whose rows and columns both carry labels.

data = {
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
    "City": ["New York", "Los Angeles", "Chicago"],
}
df = pd.DataFrame(data)  # each dictionary key becomes a column
print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [8]:
# Convert the DataFrame to a NumPy array. The columns mix strings and
# integers, so the resulting array has dtype=object.
# (numpy is imported with the other imports in the first code cell.)
np.array(df)

array([['Alice', 25, 'New York'],
       ['Bob', 30, 'Los Angeles'],
       ['Charlie', 35, 'Chicago']], dtype=object)

In [9]:
# DataFrame from a list of dictionaries: every dictionary is one row, and
# its keys name the columns.
data = [
    dict(Name="Alice", Age=25, City="New York"),
    dict(Name="Bob", Age=30, City="Los Angeles"),
    dict(Name="Charlie", Age=35, City="Chicago"),
]

df_from_list = pd.DataFrame(data)
print(df_from_list)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [17]:
# Read a CSV file
# Only the read itself is guarded: a missing file is reported with a
# message, and the preview is printed only when the load succeeded.

try:
    df_csv = pd.read_csv('sales_data.csv', encoding='latin1')
except FileNotFoundError:
    print("Error: 'sales_data.csv' file not found. Please check the file path.")
else:
    # Show the first five rows of the loaded DataFrame.
    print(df_csv.head(n=5))

   ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
0        10107               30      95.70                2  2871.00   
1        10121               34      81.35                5  2765.90   
2        10134               41      94.74                2  3884.34   
3        10145               45      83.26                6  3746.70   
4        10159               49     100.00               14  5205.27   

         ORDERDATE   STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
0   2/24/2003 0:00  Shipped       1         2     2003  ...   
1    5/7/2003 0:00  Shipped       2         5     2003  ...   
2    7/1/2003 0:00  Shipped       3         7     2003  ...   
3   8/25/2003 0:00  Shipped       3         8     2003  ...   
4  10/10/2003 0:00  Shipped       4        10     2003  ...   

                    ADDRESSLINE1  ADDRESSLINE2           CITY STATE  \
0        897 Long Airport Avenue           NaN            NYC    NY   
1             59 rue de l'Abbaye           NaN

In [18]:
# tail(5) returns the last five rows of the DataFrame loaded from the CSV.
print(df_csv.tail(5))  # Display the last few rows of the DataFrame

      ORDERNUMBER  QUANTITYORDERED  PRICEEACH  ORDERLINENUMBER    SALES  \
2818        10350               20     100.00               15  2244.40   
2819        10373               29     100.00                1  3978.51   
2820        10386               43     100.00                4  5417.57   
2821        10397               34      62.24                1  2116.16   
2822        10414               47      65.52                9  3079.44   

           ORDERDATE    STATUS  QTR_ID  MONTH_ID  YEAR_ID  ...  \
2818  12/2/2004 0:00   Shipped       4        12     2004  ...   
2819  1/31/2005 0:00   Shipped       1         1     2005  ...   
2820   3/1/2005 0:00  Resolved       1         3     2005  ...   
2821  3/28/2005 0:00   Shipped       1         3     2005  ...   
2822   5/6/2005 0:00   On Hold       2         5     2005  ...   

               ADDRESSLINE1  ADDRESSLINE2      CITY STATE POSTALCODE  COUNTRY  \
2818     C/ Moralzarzal, 86           NaN    Madrid   NaN      28034   

In [19]:
# A bare expression on the last line of a cell is shown as the cell's output.
df

Unnamed: 0,Name,Age,City
0,Alice,25,New York
1,Bob,30,Los Angeles
2,Charlie,35,Chicago


In [20]:
# Bracket indexing with a column label returns that column as a Series.
df['Name']  # Accessing a column by name

0      Alice
1        Bob
2    Charlie
Name: Name, dtype: object

In [22]:
# .loc is label-based: the 0 here is the row's index label, not its position.
df.loc[0]  # Accessing the first row by label

Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [23]:
# .iloc is position-based: 0 selects whichever row comes first, whatever its label.
df.iloc[0]  # Accessing the first row by index position

Name       Alice
Age           25
City    New York
Name: 0, dtype: object

In [26]:
# Select the value at row position 0, column position 2 with a single .iloc
# call. The chained form df.iloc[0][2] indexes the intermediate row Series
# (whose labels are the column names) with an integer, which pandas
# deprecates as positional access and flags with a FutureWarning.
df.iloc[0, 2]  # Accessing the first row and third column by index position

  df.iloc[0][2]  # Accessing the first row and third column by index position


'New York'

In [27]:
## Accessing a specified element

# .at reads a single value addressed by (row label, column label).
df.at[0, 'Name']  # Accessing the 'Name' column of the first row by label

'Alice'

In [28]:
# Using iat
# .iat reads a single value addressed by (row position, column position).
df.iat[0, 2]  # Accessing the first row and third column by index position

'New York'

In [34]:
# Data manipulation with a DataFrame

# Add a new column by assigning a list that has one value per row.
df["Salary"] = [50_000, 60_000, 70_000]
print(df)

      Name  Age         City  Salary
0    Alice   25     New York   50000
1      Bob   30  Los Angeles   60000
2  Charlie   35      Chicago   70000


In [35]:
# Removing a column
# drop() returns a new DataFrame without the 'Salary' column; rebinding df
# makes the removal permanent without relying on inplace=True.
df = df.drop(columns='Salary')
print(df)

      Name  Age         City
0    Alice   25     New York
1      Bob   30  Los Angeles
2  Charlie   35      Chicago


In [36]:
# Add 5 to every value in the 'Age' column (element-wise) and store the
# result back in the same column.
# Note: the column is both read and overwritten, so re-running this cell
# adds another 5 each time.
df['Age'] = df['Age'].add(5)
print(df)

      Name  Age         City
0    Alice   30     New York
1      Bob   35  Los Angeles
2  Charlie   40      Chicago


In [37]:
# Deleting a row
# Drop the row whose index label is 0 and rebind df to the result. The
# remaining rows keep their original labels; the index is not renumbered.
df = df.drop(index=0)
print(df)

      Name  Age         City
1      Bob   35  Los Angeles
2  Charlie   40      Chicago


In [38]:
# Describing the data
# describe() reports count, mean, std, min, quartiles and max for the
# numeric columns; here that is only 'Age'.
print(df.describe())  # Get a statistical summary of the DataFrame

             Age
count   2.000000
mean   37.500000
std     3.535534
min    35.000000
25%    36.250000
50%    37.500000
75%    38.750000
max    40.000000
