## **Pandas**

In [1]:
import pandas as pd

mydataset = {
  'cars': ["BMW", "Volvo", "Ford"],
  'passings': [3, 7, 2]
}

myvar = pd.DataFrame(mydataset)

print(myvar)

print(pd.__version__)

    cars  passings
0    BMW         3
1  Volvo         7
2   Ford         2
1.3.5


In [2]:
a = [1, 7, 2]

myvar = pd.Series(a)

print(myvar)

print(myvar[0])

0    1
1    7
2    2
dtype: int64
1


In [3]:
a = [1, 7, 2]

myvar = pd.Series(a, index = ["x", "y", "z"])

print(myvar)
print(myvar["y"]) 

x    1
y    7
z    2
dtype: int64
7


## Key/Value Objects as **Series**

In [4]:

calories = {"day1": 420, "day2": 380, "day3": 390}

myvar = pd.Series(calories)

print(myvar)

day1    420
day2    380
day3    390
dtype: int64


In [6]:

calories = {"day1": 420, "day2": 380, "day3": 390}

myvar = pd.Series(calories, index = ["day1", "day2"])

print(myvar)


day1    420
day2    380
dtype: int64


## **DataFrame**

In [None]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

myvar = pd.DataFrame(data)

print(myvar)

In [8]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

#load data into a DataFrame object:
df = pd.DataFrame(data)

print(df) 
###Locate Row
#refer to the row index:
print(df.loc[0])
#use a list of indexes:
print(df.loc[[0, 1]])



   calories  duration
0       420        50
1       380        40
2       390        45
calories    420
duration     50
Name: 0, dtype: int64
   calories  duration
0       420        50
1       380        40


## Named **indexes**

In [9]:
data = {
  "calories": [420, 380, 390],
  "duration": [50, 40, 45]
}

df = pd.DataFrame(data, index = ["day1", "day2", "day3"])

print(df) 
##Locate Named Indexes
#refer to the named index:
print(df.loc["day2"])


      calories  duration
day1       420        50
day2       380        40
day3       390        45
calories    380
duration     40
Name: day2, dtype: int64


Load file into dataframe

In [11]:
###Load Files Into a DataFrame

df = pd.read_csv('data.csv')

print(df) 


FileNotFoundError: ignored

## pandas read **csv**

In [None]:
#pandas read CSV

df = pd.read_csv('data.csv')

print(df.to_string()) 


df = pd.read_csv('data.csv')

print(df) 

In [13]:
######max_rows

print(pd.options.display.max_rows)

60


In [14]:
###Increase the maximum number of rows to display the entire DataFrame:


pd.options.display.max_rows = 9999

df = pd.read_csv('data.csv')

print(df) 


FileNotFoundError: ignored

Reas json

In [15]:
df = pd.read_json('data.json')

print(df.to_string()) 

###Dictionary as JSON
import pandas as pd

data = {
  "Duration":{
    "0":60,
    "1":60,
    "2":60,
    "3":45,
    "4":45,
    "5":60
  },
  "Pulse":{
    "0":110,
    "1":117,
    "2":103,
    "3":109,
    "4":117,
    "5":102
  },
  "Maxpulse":{
    "0":130,
    "1":145,
    "2":135,
    "3":175,
    "4":148,
    "5":127
  },
  "Calories":{
    "0":409,
    "1":479,
    "2":340,
    "3":282,
    "4":406,
    "5":300
  }
}

df = pd.DataFrame(data)

print(df) 

ValueError: ignored

# #Pandas - Analyzing **DataFrames**

In [None]:

df = pd.read_csv('data.csv')

print(df.head(10))


df = pd.read_csv('data.csv')

print(df.head())

print(df.tail()) 
print(df.info()) 

In [None]:
#The info() 
#method also tells us how many Non-Null values there are present in each column, 
#and in our data set it seems like there are 164 of 169 Non-Null values in the 
#"Calories" column

##Pandas - Cleaning **Data**

In [None]:

df = pd.read_csv('data.csv')

new_df = df.dropna()

print(new_df.to_string())


df = pd.read_csv('data.csv')

df.dropna(inplace = True)

print(df.to_string())

#Replace Empty Values


In [None]:
import pandas as pd

df = pd.read_csv('data.csv')

df.fillna(130, inplace = True)

#Replace Only For Specified **Columns**

In [None]:
#Replace Only For Specified Columns
import pandas as pd

df = pd.read_csv('data.csv')

df["Calories"].fillna(130, inplace = True)

#Replace Using Mean, Median, or **Mode**

In [None]:
import pandas as pd

df = pd.read_csv('data.csv')

x = df["Calories"].mean()

df["Calories"].fillna(x, inplace = True)

import pandas as pd

df = pd.read_csv('data.csv')

x = df["Calories"].median()

df["Calories"].fillna(x, inplace = True)

import pandas as pd

df = pd.read_csv('data.csv')

x = df["Calories"].mode()[0]

df["Calories"].fillna(x, inplace = True)

#Pandas - Cleaning Data of Wrong **Format**

In [None]:
####Convert Into a Correct Format

import pandas as pd

df = pd.read_csv('data.csv')

df['Date'] = pd.to_datetime(df['Date'])

print(df.to_string())
####Removing Rows

df.dropna(subset=['Date'], inplace = True)


#Pandas - Fixing Wrong **Data**

In [None]:
##Replacing Values
df.loc[7, 'Duration'] = 45 #Set "Duration" = 45 in row 7:


#Loop through all values in the "Duration" column.
If the value is higher than 120, set it to 120:
for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.loc[x, "Duration"] = 120



#Delete rows where "Duration" is higher than 120:
for x in df.index:
  if df.loc[x, "Duration"] > 120:
    df.drop(x, inplace = True)

#Pandas - Removing **Duplicates**

In [None]:
print(df.duplicated()) #Returns True for every row that is a duplicate, othwerwise False:

df.drop_duplicates(inplace = True)

## **Correlation**

In [None]:
df.corr()

##**plotting**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')

df.plot()

plt.show()
#Pandas uses the plot() method to create diagrams.


In [None]:
##Scatter Plot
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')

df.plot(kind = 'scatter', x = 'Duration', y = 'Calories')

plt.show()

##Histogram
df["Duration"].plot(kind = 'hist')

