# basics

In [81]:
import numpy as np
import pandas as pd

In [82]:
# creating a dataframe
df = pd.DataFrame(
    {
    "Name":["ammar", "mohammed", "salma"],
    "Age":[18, 26, 20],
    "Gender":["male", "male", "female"]
    }
) 
df

Unnamed: 0,Name,Age,Gender
0,ammar,18,male
1,mohammed,26,male
2,salma,20,female


In [83]:
# accessing a df
print(df["Age"])
highest_age = df["Age"].max()
highest_age_index = df["Age"].idxmax()
highest_age_person = df.loc[highest_age_index, "Name"]
print(f"highest age is {highest_age_person} with {highest_age}")

0    18
1    26
2    20
Name: Age, dtype: int64
highest age is mohammed with 26


In [84]:
# creating and adding a column to a dataframe
score = pd.Series([95, 100, 80], name = "Score")
df["Score"] = score
df

Unnamed: 0,Name,Age,Gender,Score
0,ammar,18,male,95
1,mohammed,26,male,100
2,salma,20,female,80


In [85]:
# creating and adding a row to a dataframe
s = pd.Series({"Name": "Abdullah", "Age":29, "Gender":"male", "Score":90})
df = pd.concat([df, s.to_frame().T], ignore_index=True)
df

Unnamed: 0,Name,Age,Gender,Score
0,ammar,18,male,95
1,mohammed,26,male,100
2,salma,20,female,80
3,Abdullah,29,male,90


In [86]:
# creating a df with random values
frame = pd.DataFrame(np.random.randn(1000, 5), columns=["a", "b", "c", "d", "e"])
frame

Unnamed: 0,a,b,c,d,e
0,0.402138,-1.209152,2.284759,0.057203,0.298675
1,0.239660,-0.832933,-0.157755,-0.418097,-0.027979
2,2.331020,0.728418,0.164615,-1.831252,1.153305
3,0.597519,0.584572,0.136205,2.917577,0.134153
4,1.337961,0.426735,-0.617580,-0.077445,0.208566
...,...,...,...,...,...
995,0.647394,0.583787,-0.004842,-0.259060,-0.861234
996,-0.365794,-1.199675,-0.660669,1.780094,2.260899
997,0.706823,-1.247054,-1.645525,-0.037371,0.229931
998,0.604604,-1.958296,-0.525714,0.520444,-0.497043


In [87]:
# finding basic statistics of numerical data in the dataframe
frame.describe()

Unnamed: 0,a,b,c,d,e
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,0.025075,0.011162,-0.012036,0.015647,0.019679
std,1.013424,0.97674,0.999497,1.014784,1.032255
min,-3.122314,-2.987782,-2.767588,-3.650668,-2.875889
25%,-0.652331,-0.654982,-0.688691,-0.663896,-0.679997
50%,0.007459,0.039943,-0.007024,0.025072,0.014024
75%,0.719439,0.680188,0.665341,0.68013,0.703127
max,2.889173,3.528409,3.552663,3.680377,3.024507


In [88]:
# finding basic statistics of non numerical data in the dataframe
df.describe()

Unnamed: 0,Name,Age,Gender,Score
count,4,4,4,4
unique,4,4,2,4
top,ammar,18,male,95
freq,1,1,3,1


# reading, writing, and converting (.txt, .csv, .xlsx) 

example 1: titanic.csv -> pd -> titanic.xlsx -> pd

In [None]:
# reading titanic.csv
titanic = pd.read_csv("data/titanic.csv")
titanic

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [None]:
# converting titanic.csv to titanic.xlsx
titanic = pd.read_csv("data/titanic.csv")
titanic.to_excel("data/titanic.xlsx", index=False)

In [112]:
# converting .xlsx to .csv
titanic = pd.read_excel("data/titanic.xlsx")
titanic.to_csv("data/titanic.csv", index=False)

Example 1: products.txt -> df -> products.csv -> df -> products.xlsx -> df

In [None]:
# converting products.txt to products.csv
products = pd.read_csv("data/products.txt", sep=",") #assuming comma-separated values
products.to_csv("data/products.csv", index=False)

In [None]:
# converting products.csv to products.xlsx
products = pd.read_csv("data/products.txt", sep=",")
products.to_excel("data/products.xlsx", index=False)

In [None]:
# reading products.xlsx
products = pd.read_excel("data/products.xlsx")
products

Unnamed: 0,ID,Name,Category,Price,Stock
0,1,Milk,Dairy,3.5,40
1,2,Bread,Bakery,2.0,60
2,3,Eggs,Dairy,5.0,30
3,4,Rice,Grains,10.0,25
4,5,Pasta,Grains,4.0,50
5,6,Chicken Breast,Meat,18.0,20
6,7,Apples,Fruit,6.0,45
7,8,Bananas,Fruit,4.0,55
8,9,Tomatoes,Vegetables,5.5,35
9,10,Potatoes,Vegetables,7.0,40


# selecting a subset of a DataFrame

#### select specific columns from a DataFrame

In [None]:
# age of the titanic passengers
ages = titanic["Age"]
ages.head()

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
Name: Age, dtype: float64

In [119]:
# age and sex of the titanic passengers
name_age_sex = titanic[["Name", "Age", "Sex"]]
name_age_sex.head()

Unnamed: 0,Name,Age,Sex
0,"Braund, Mr. Owen Harris",22.0,male
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",38.0,female
2,"Heikkinen, Miss Laina",26.0,female
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",35.0,female
4,"Allen, Mr. William Henry",35.0,male
