Python for Data Analysis

In [None]:
# 1. Import libraries
from pathlib import Path  # for checking that input files exist

import numpy as np     # for numeric operations
import pandas as pd    # for data analysis

In [None]:
# 2. Read Excel or CSV file
# Both reads are examples. Each one runs only when its file is present, so the
# notebook still runs top to bottom on a machine without the sample files.
excel_path = Path("students.xlsx")
csv_path = Path("students.csv")

# Example: if you have an Excel file "students.xlsx"
if excel_path.is_file():
    df = pd.read_excel(excel_path)
else:
    print(f"{excel_path} not found - skipping the Excel example")

# If it's a CSV file (when both files exist, this result replaces the Excel one)
if csv_path.is_file():
    df = pd.read_csv(csv_path)
else:
    print(f"{csv_path} not found - skipping the CSV example")


In [None]:
# Sample data set: one list per column, all seven entries in the same row order
data = dict(
    name=["Aruzhan", "Dias", "Aliya", "Miras", "Dana", "Askar", "Aigerim"],
    age=[22, 25, 23, 30, 28, 35, 27],
    city=["Almaty", "Astana", "Shymkent", "Almaty", "Astana", "Karaganda", "Almaty"],
    education=["Bachelor", "Master", "Bachelor", "PhD", "Master", "Bachelor", "Master"],
    income=[250000, 400000, 300000, 600000, 450000, 350000, 500000],
)

# Each dictionary key becomes a column; rows get the default 0..6 index
df = pd.DataFrame(data)
print(df)


      name  age       city education  income
0  Aruzhan   22     Almaty  Bachelor  250000
1     Dias   25     Astana    Master  400000
2    Aliya   23   Shymkent  Bachelor  300000
3    Miras   30     Almaty       PhD  600000
4     Dana   28     Astana    Master  450000
5    Askar   35  Karaganda  Bachelor  350000
6  Aigerim   27     Almaty    Master  500000


First Look at the Data

In [None]:
# 3. See first and last rows
print(df.head())   # first 5 rows
print(df.tail())   # last 5 rows

# 4. Shape of dataset as a (rows, columns) tuple
print(df.shape)    # e.g., (7, 5) for the seven-row sample DataFrame

# 5. Column names
print(df.columns)

# 6. Data types
print(df.dtypes)

# 7. Quick info summary
# info() writes its report directly to the output and returns None, so it is
# called on its own; wrapping it in print() would add a stray "None" line.
df.info()


      name  age      city education  income
0  Aruzhan   22    Almaty  Bachelor  250000
1     Dias   25    Astana    Master  400000
2    Aliya   23  Shymkent  Bachelor  300000
3    Miras   30    Almaty       PhD  600000
4     Dana   28    Astana    Master  450000
      name  age       city education  income
2    Aliya   23   Shymkent  Bachelor  300000
3    Miras   30     Almaty       PhD  600000
4     Dana   28     Astana    Master  450000
5    Askar   35  Karaganda  Bachelor  350000
6  Aigerim   27     Almaty    Master  500000
(7, 5)
Index(['name', 'age', 'city', 'education', 'income'], dtype='object')
name         object
age           int64
city         object
education    object
income        int64
dtype: object
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   name       7 non-null      object
 1   age        7 non-null      int64 
 2   city       7 n

Basic Descriptive Statistics

In [None]:
# 8. Summary statistics for numeric columns
numeric_summary = df.describe()
print(numeric_summary)

# 9. The same kind of summary for the text (object-dtype) columns
categorical_summary = df.describe(include=["object"])
print(categorical_summary)

# 10. Number of missing values in each column
missing_per_column = df.isna().sum()
print(missing_per_column)

# 11. Number of rows flagged as duplicates
duplicate_rows = df.duplicated().sum()
print(duplicate_rows)


             age         income
count   7.000000       7.000000
mean   27.142857  407142.857143
std     4.450789  120514.768903
min    22.000000  250000.000000
25%    24.000000  325000.000000
50%    27.000000  400000.000000
75%    29.000000  475000.000000
max    35.000000  600000.000000
           name    city education
count         7       7         7
unique        7       4         3
top     Aruzhan  Almaty  Bachelor
freq          1       3         3
name         0
age          0
city         0
education    0
income       0
dtype: int64
0


Exploring Columns

In [None]:
# 12. Value counts for a categorical column
# normalize=True reports each city's share of the rows instead of raw counts.
# Python's boolean literal is capitalized; lowercase `true` raises NameError.
print(df["city"].value_counts(normalize=True))

# Include missing values in the count
print(df["city"].value_counts(dropna=False))

# 13. Unique values in a column
print(df["education"].unique())

# 14. Minimum, maximum
print(df["age"].min())
print(df["age"].max())

city
Almaty       3
Astana       2
Shymkent     1
Karaganda    1
Name: count, dtype: int64
city
Almaty       3
Astana       2
Shymkent     1
Karaganda    1
Name: count, dtype: int64
['Bachelor' 'Master' 'PhD']
22
35


Sorting & Filtering

In [None]:
# 15. Sort by column: highest age first, then keep the top five rows
(
    df
    .sort_values(by="age", ascending=False)
    .head(5)
)

Unnamed: 0,name,age,city,education,income
5,Askar,35,Karaganda,Bachelor,350000
3,Miras,30,Almaty,PhD,600000
4,Dana,28,Astana,Master,450000
6,Aigerim,27,Almaty,Master,500000
1,Dias,25,Astana,Master,400000


In [None]:
# 16. Filter rows: build a boolean mask, then keep only the rows where it is True
is_over_30 = df["age"] > 30
df[is_over_30]

Unnamed: 0,name,age,city,education,income
5,Askar,35,Karaganda,Bachelor,350000


In [None]:
# Filter by multiple conditions: combine the boolean masks with & (element-wise AND)
age_over_30 = df["age"] > 30
lives_in_almaty = df["city"] == "Almaty"
df[age_over_30 & lives_in_almaty]

Unnamed: 0,name,age,city,education,income


Grouping & Aggregation

In [None]:
# 17. Group by city and calculate mean income (one value per city)
(
    df
    .groupby(by="city")["income"]
    .mean()
)


Unnamed: 0_level_0,income
city,Unnamed: 1_level_1
Almaty,450000.0
Astana,425000.0
Karaganda,350000.0
Shymkent,300000.0


In [None]:
# 18. Multiple aggregations: each listed function becomes one output column
(
    df
    .groupby(by="city")["income"]
    .agg(func=["mean", "min", "max", "count"])
)


Unnamed: 0_level_0,mean,min,max,count
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Almaty,450000.0,250000,600000,3
Astana,425000.0,400000,450000,2
Karaganda,350000.0,350000,350000,1
Shymkent,300000.0,300000,300000,1


Pivot Table

In [None]:
# 19. Pivot table: average income by city (rows) and education (columns).
# City/education combinations with no rows show up as NaN.
pd.pivot_table(
    data=df,
    index="city",
    columns="education",
    values="income",
    aggfunc="mean",
)


education,Bachelor,Master,PhD
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Almaty,250000.0,500000.0,600000.0
Astana,,425000.0,
Karaganda,350000.0,,
Shymkent,300000.0,,


In [None]:
import pandas as pd
from sklearn.datasets import load_iris

# Load iris dataset, asking for pandas objects instead of NumPy arrays
iris = load_iris(as_frame=True)

# iris.frame is a single DataFrame with the feature columns plus "target".
# NOTE: this rebinds `df`, replacing the students DataFrame used in earlier cells.
df = iris.frame
print(df.head(5))


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  
