# Lecture - Pandas basics

- pandas.Series
- pandas.DataFrame
- read_csv
- indexing
- plotting

## Pandas Series

- can create from dictionary
- can create from list
- can create from np.array

In [4]:
import pandas as pd

programs_dict = dict(AI = 26, NET = 38, Java = 30, UX = 28)

programs_series = pd.Series(programs_dict)
programs_series


AI      26
NET     38
Java    30
UX      28
dtype: int64

In [8]:
#extract values through indexing
print(f"{programs_series[0] = }")
print(f"{programs_series[-1] = }")

print(f"{programs_series['UX'] = }")

# get keys
print(f"{programs_series.keys()}")
print(f"{programs_series.keys()[0]}")

programs_series[0] = 26
programs_series[-1] = 28
programs_series['UX'] = 28
Index(['AI', 'NET', 'Java', 'UX'], dtype='object')
AI


In [13]:
import random as rnd

rnd.seed(1337)

dice_series = pd.Series([rnd.randint(1, 6) for _ in range(10)])
# inspects the first five element(default) or n amount
dice_series.head(3)

0    5
1    5
2    6
dtype: int64

In [15]:
print(f"{dice_series.min() = }") # skriver ut det minsta värdet
print(f"{dice_series.max() = }") # skriver ut högsta värdet
print(f"{dice_series.argmin() = }") # ger indexen för min value
print(f"{dice_series.argmax() = }") # ger indexen för max value
print(f"{dice_series.mean() = }") # medelvärdet - average
print(f"{dice_series.median() = }") # sort all values in order - pick the middle one, if middle are 2 numbers calculate average of them


dice_series.min() = 2
dice_series.max() = 6
dice_series.argmin() = 7
dice_series.mean() = 4.4
dice_series.median() = 5.0


-----
## DataFrame

- tabular data with rows and columns
- analog to 2D numpy arrays with flexible row indices and col names
- "specialized dictionary with col name mapped to a Series object" 

In [20]:
# instantiated a DataFrame from a Series object 
df_programs = pd.DataFrame(programs_series, columns=("Number_of_students", ))
df_programs

Unnamed: 0,Number_of_students
AI,26
NET,38
Java,30
UX,28


In [23]:
# create 2 Series objects
students = pd.Series({"AI": 26, "NET": 38, "UX": 28, "Java": 30})
skills = pd.Series({"AI": "Python", "Net": "C#", "UX": "Figma",  "Java": "Java"})


# create a DataFrame from 2 Series objects

df_programs = pd.DataFrame({"Students": students, "Skills": skills})
df_programs

Unnamed: 0,Students,Skills
AI,26.0,Python
Java,30.0,Java
NET,38.0,
Net,,C#
UX,28.0,Figma


In [25]:
df_programs["Students"].mean(), (26+30+38+28)/4 # plockar ut en utav listorna


(30.5, 30.5)

In [26]:
median_students_number = df_programs["Students"].median()
print(f"Meidan students in the programs {df_programs.index.to_list()} is {median_students_number:.0f}")

Meidan students in the programs ['AI', 'Java', 'NET', 'Net', 'UX'] is 29


In [None]:
df_programs["Skills"][0], # osv

## Indexers

- loc - slicing and indexing using explicit index
- iloc - slicing and indexing using python style indexing

In [28]:
df_programs.loc["AI"] # accessar hela raden "AI"

# returns a series object

Students      26.0
Skills      Python
Name: AI, dtype: object

In [None]:
df_programs.loc["Java"] # returns a DataFrame object

In [31]:
df_programs.iloc[1:2] # väljer hur många rader och kolumner att visa

Unnamed: 0,Students,Skills
Java,30.0,Java


## Masking

In [32]:
# Kollar hur många av objekten i listorna eller dictionaries är en siffra eller högre, funkar som vanligt
# kommer ut som bools (true, false)
df_programs["Students"] >= 30

AI      False
Java     True
NET      True
Net     False
UX      False
Name: Students, dtype: bool

In [33]:
df_programs[df_programs["Students"] >= 30] # filtrerad DataFrame

Unnamed: 0,Students,Skills
Java,30.0,Java
NET,38.0,
