In [2]:
import pandas as pd
import numpy as np

In [3]:
# Sample frame: slash-separated date strings next to an integer column.
df = pd.DataFrame(
    dict(
        date=['3/10/2000', '3/11/2000', '3/12/2000'],
        value=[2, 3, 4],
    )
)


In [4]:
# Render the freshly built frame; 'date' still holds plain strings here.
df

Unnamed: 0,date,value
0,3/10/2000,2
1,3/11/2000,3
2,3/12/2000,4


In [6]:
# Inspect the column dtypes before any date parsing is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3 non-null      object
 1   value   3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [9]:
# Convert the 'date' column to datetimes, reading the first field as the day.
# NOTE(review): for string input such as '3/10/2000', dayfirst=True means
# 3 October 2000, yet the saved output of the following cells shows 2000-03-10.
# That suggests the column was already datetime when this cell last ran (the
# execution counter jumps from 6 to 9) -- confirm with Restart & Run All.
df['date'] = df['date'].pipe(pd.to_datetime, dayfirst=True)

In [10]:
# Re-check the dtypes to confirm that 'date' is now a datetime column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3 non-null      datetime64[ns]
 1   value   3 non-null      int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 176.0 bytes


In [11]:
# Preview the first rows of the frame after the conversion.
df.head()

Unnamed: 0,date,value
0,2000-03-10,2
1,2000-03-11,3
2,2000-03-12,4


In [15]:
# Timestamps written without zero padding, parsed with an explicit format.
# The format reads the fields as year-DAY-month, so '2016-6-10' becomes
# 6 October 2016 (not 10 June).
df = pd.DataFrame(
    dict(
        date=['2016-6-10 20:30:0',
              '2016-7-1 19:45:30',
              '2013-10-12 4:5:1'],
        value=[2, 3, 4],
    )
).assign(
    date=lambda frame: pd.to_datetime(frame['date'], format='%Y-%d-%m %H:%M:%S')
)

In [16]:
# Show the frame parsed with the explicit year-day-month format.
df

Unnamed: 0,date,value
0,2016-10-06 20:30:00,2
1,2016-01-07 19:45:30,3
2,2013-12-10 04:05:01,4


In [20]:
# The second entry ('a/11/2000') is not a valid date; errors='coerce' turns
# unparseable values into NaT instead of raising.
df = pd.DataFrame(
    dict(
        date=['3/10/2000', 'a/11/2000', '3/12/2000'],
        value=[2, 3, 4],
    )
).assign(
    date=lambda frame: pd.to_datetime(frame['date'], errors='coerce')
)

In [21]:
# Show the result; the invalid entry appears as NaT.
df

Unnamed: 0,date,value
0,2000-03-10,2
1,NaT,3
2,2000-03-12,4


In [22]:
# Date components stored in separate integer columns.
df = pd.DataFrame(
    dict(
        year=[2015, 2016],
        month=[2, 3],
        day=[4, 5],
    )
)


In [23]:
# Show the year/month/day component frame.
df

Unnamed: 0,year,month,day
0,2015,2,4
1,2016,3,5


In [26]:
# Assemble a datetime column from the year/month/day component columns.
# Only those three columns are passed: pd.to_datetime on a whole frame rejects
# columns it does not recognise as date parts, so the unrestricted call fails
# as soon as 'date' exists (i.e. whenever this cell is run a second time).
df['date'] = pd.to_datetime(df[['year', 'month', 'day']])

In [27]:
# Show the frame with the assembled 'date' column appended.
df

Unnamed: 0,year,month,day,date
0,2015,2,4,2015-02-04
1,2016,3,5,2016-03-05


In [28]:
# People with their dates of birth as month-day-year strings.
df = pd.DataFrame(
    dict(
        name=['Tom', 'Andy', 'Lucas'],
        DoB=['08-05-1997', '04-28-1996', '12-16-1995'],
    )
)


In [29]:
# Show the people frame; 'DoB' is still a string column at this point.
df

Unnamed: 0,name,DoB
0,Tom,08-05-1997
1,Andy,04-28-1996
2,Lucas,12-16-1995


In [30]:
# Parse the birth-date strings as month-day-year. The format is spelled out
# because a value such as '08-05-1997' is ambiguous (5 August vs 8 May) and
# should not depend on pandas' format inference. The result on this data is
# unchanged: 1997-08-05, 1996-04-28, 1995-12-16.
df["DoB"] = pd.to_datetime(df["DoB"], format='%m-%d-%Y')

In [31]:
# Confirm that 'DoB' now has a datetime dtype.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   name    3 non-null      object        
 1   DoB     3 non-null      datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 176.0+ bytes


In [32]:
# Break the birth date into separate Year / Month / Day columns using the
# .dt accessor, then display the widened frame.
for column, part in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    df[column] = getattr(df['DoB'].dt, part)

df

Unnamed: 0,name,DoB,Year,Month,Day
0,Tom,1997-08-05,1997,8,5
1,Andy,1996-04-28,1996,4,28
2,Lucas,1995-12-16,1995,12,16


In [34]:
# Capture the current timestamp; the age calculations that follow use it.
# NOTE: this is wall-clock dependent, so the derived ages change over time.
today = pd.to_datetime('today')
today

Timestamp('2024-08-16 12:08:17.858270')

In [40]:
# First approximation of age: plain difference of calendar years. It does not
# look at whether this year's birthday has already happened.
birth_year = df['DoB'].dt.year
df['age'] = today.year - birth_year

In [41]:
# Show the frame including the new 'age' column.
df

Unnamed: 0,name,DoB,Year,Month,Day,age
0,Tom,1997-08-05,1997,8,5,27
1,Andy,1996-04-28,1996,4,28,28
2,Lucas,1995-12-16,1995,12,16,29


In [44]:
# Age in completed years: start from the calendar-year difference and subtract
# one for everyone whose birthday has not yet occurred this year.
today = pd.to_datetime('today')
diff_y = today.year - df['DoB'].dt.year

# Compare (month, day) tuples. The year must not take part in this comparison:
# pairing the month with the year cannot tell whether a birthday that falls
# later in the current month has already passed.
b_md = df['DoB'].apply(lambda x: (x.month, x.day))
no_birthday = b_md > (today.month, today.day)

# The boolean mask counts as 0/1 in the subtraction.
df['age'] = diff_y - no_birthday

In [43]:
# Display the no_birthday mask that the age calculation subtracts from the
# year difference.
no_birthday

0    False
1    False
2     True
Name: DoB, dtype: bool

In [45]:
# Show the frame with the adjusted 'age' column.
df

Unnamed: 0,name,DoB,Year,Month,Day,age
0,Tom,1997-08-05,1997,8,5,27
1,Andy,1996-04-28,1996,4,28,28
2,Lucas,1995-12-16,1995,12,16,28


# Working with different file formats (JSON, CSV, Excel, Avro)


In [46]:
import json

In [47]:
# Small nested mapping used as the payload for the JSON round trip.
data = {"president": dict(name="Elephant", species='Amphibians')}

In [48]:
# Serialise `data` to data_file.json in the current working directory.
with open('data_file.json', 'w') as json_out:
    json_out.write(json.dumps(data))

In [51]:
# Read data_file.json back and decode it, rebinding `data` to the result.
with open('data_file.json', 'r') as json_in:
    data = json.loads(json_in.read())

In [52]:
# Check the Python type that json.load produced for the top-level JSON object.
type(data)

dict

In [53]:
# Show the decoded content to verify the round trip.
data

{'president': {'name': 'Elephant', 'species': 'Amphibians'}}

In [54]:
pip install openpyxl

Collecting openpyxl
  Obtaining dependency information for openpyxl from https://files.pythonhosted.org/packages/c0/da/977ded879c29cbd04de313843e76868e6e13408a94ed6b987245dc7c8506/openpyxl-3.1.5-py2.py3-none-any.whl.metadata
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Obtaining dependency information for et-xmlfile from https://files.pythonhosted.org/packages/96/c2/3dd434b0108730014f1b96fd286040dc3bcb70066346f7e01ec2ac95865f/et_xmlfile-1.1.0-py3-none-any.whl.metadata
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
   ---------------------------------------- 0.0/250.9 kB ? eta -:--:--
   -------------- ------------------------- 92.2/250.9 kB 2.6 MB/s eta 0:00:01
   ---------------------------------------- 250.9/250.9 kB 3.0 MB/s eta 0:00:00
Using cached et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Suc


[notice] A new release of pip is available: 23.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [55]:
from io import StringIO

import pandas as pd

# JSON text keyed by row label; each value is one record.
jsonStr = '''{"Index0":{"Courses": "Pandas","Discount": "1200"},
           "Index1":{"Courses": "Hadoop","Discount": "1500"},
           "Index2":{"Courses": "Spark","Discount": "1800"}
          }'''
# read_json expects a path or a file-like object; handing it a literal JSON
# string is deprecated in recent pandas releases (the stray echoed line in this
# cell's saved output is consistent with that warning), so the text is wrapped
# in StringIO. orient='index' maps the outer keys to the row index.
df2 = pd.read_json(StringIO(jsonStr), orient='index')

  df2 = pd.read_json(jsonStr, orient = 'index')


In [56]:
# Show the frame built from the JSON text.
df2

Unnamed: 0,Courses,Discount
Index0,Pandas,1200
Index1,Hadoop,1500
Index2,Spark,1800


In [57]:
# Look at the nested mapping stored under the 'president' key.
data['president']


{'name': 'Elephant', 'species': 'Amphibians'}

In [58]:
# Turn the nested dict into a frame: with orient='index' each outer key
# becomes a row label and the inner keys become the columns.
df3 = pd.DataFrame.from_dict(data, orient = 'index')

In [59]:
# Show the single-row frame built from the dict.
df3

Unnamed: 0,name,species
president,Elephant,Amphibians


In [60]:
pip install avro-python3

Collecting avro-python3
  Downloading avro-python3-1.10.2.tar.gz (38 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Installing backend dependencies: started
  Installing backend dependencies: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: avro-python3
  Building wheel for avro-python3 (pyproject.toml): started
  Building wheel for avro-python3 (pyproject.toml): finished with status 'done'
  Created wheel for avro-python3: filename=avro_python3-1.10.2-py3-none-any.whl size=44038 sha256=45968beb0c39b9b42f66821ca69c15d499cccfcc3fbd38ebe3c24705aa8b67f6
  Stored in directory: c:\users\dinu8\appdata\local\pip\cache\wheels\bc\85\62\6cdd81c56f923946b401cecff38055b94c9b766927f7d8ca82
Succes


[notice] A new release of pip is available: 23.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [65]:
pip install pandavro

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [72]:
import copy
import json
import pandas as pd
from avro.datafile import DataFileReader
from avro.io import DatumReader

In [73]:
# Records that will be written out: one dict per user.
users = [
    dict(name='Pierre-Simon Laplace', age=77),
    dict(name='John von Neumann', age=53),
]
users_df = pd.DataFrame.from_records(users)
print(users_df)

                   name  age
0  Pierre-Simon Laplace   77
1      John von Neumann   53
