In [1]:
import pandas as pd
import numpy as np

In [2]:
weather_df = pd.read_csv("D:/data/weather_2012.csv")

print("Shape:", weather_df.shape)
print("Index:", weather_df.index)

Shape: (8784, 8)
Index: RangeIndex(start=0, stop=8784, step=1)


In [3]:
weather_df.head()

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog"
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog


In [4]:
weather_df['Date/Time'].head()

0    2012-01-01 00:00:00
1    2012-01-01 01:00:00
2    2012-01-01 02:00:00
3    2012-01-01 03:00:00
4    2012-01-01 04:00:00
Name: Date/Time, dtype: object

### Converting the Date/Time column from type object to timestamp

In [5]:
weather_df['Date/Time'] = pd.to_datetime(weather_df['Date/Time'])

In [6]:
weather_df['Date/Time'].head()

0   2012-01-01 00:00:00
1   2012-01-01 01:00:00
2   2012-01-01 02:00:00
3   2012-01-01 03:00:00
4   2012-01-01 04:00:00
Name: Date/Time, dtype: datetime64[ns]

In [7]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Date/Time           8784 non-null   datetime64[ns]
 1   Temp (C)            8784 non-null   float64       
 2   Dew Point Temp (C)  8784 non-null   float64       
 3   Rel Hum (%)         8784 non-null   int64         
 4   Wind Spd (km/h)     8784 non-null   int64         
 5   Visibility (km)     8784 non-null   float64       
 6   Stn Press (kPa)     8784 non-null   float64       
 7   Weather             8784 non-null   object        
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 549.1+ KB


In [8]:
weather_df['Weather'].unique()

array(['Fog', 'Freezing Drizzle,Fog', 'Mostly Cloudy', 'Cloudy', 'Rain',
       'Rain Showers', 'Mainly Clear', 'Snow Showers', 'Snow', 'Clear',
       'Freezing Rain,Fog', 'Freezing Rain', 'Freezing Drizzle',
       'Rain,Snow', 'Moderate Snow', 'Freezing Drizzle,Snow',
       'Freezing Rain,Snow Grains', 'Snow,Blowing Snow', 'Freezing Fog',
       'Haze', 'Rain,Fog', 'Drizzle,Fog', 'Drizzle',
       'Freezing Drizzle,Haze', 'Freezing Rain,Haze', 'Snow,Haze',
       'Snow,Fog', 'Snow,Ice Pellets', 'Rain,Haze', 'Thunderstorms,Rain',
       'Thunderstorms,Rain Showers', 'Thunderstorms,Heavy Rain Showers',
       'Thunderstorms,Rain Showers,Fog', 'Thunderstorms',
       'Thunderstorms,Rain,Fog',
       'Thunderstorms,Moderate Rain Showers,Fog', 'Rain Showers,Fog',
       'Rain Showers,Snow Showers', 'Snow Pellets', 'Rain,Snow,Fog',
       'Moderate Rain,Fog', 'Freezing Rain,Ice Pellets,Fog',
       'Drizzle,Ice Pellets,Fog', 'Drizzle,Snow', 'Rain,Ice Pellets',
       'Drizzle,Snow,Fog', 

In [9]:
weather_df['Weather'].nunique()

50

In [10]:
weather_df['Weather'].value_counts()

Mainly Clear                               2106
Mostly Cloudy                              2069
Cloudy                                     1728
Clear                                      1326
Snow                                        390
Rain                                        306
Rain Showers                                188
Fog                                         150
Rain,Fog                                    116
Drizzle,Fog                                  80
Snow Showers                                 60
Drizzle                                      41
Snow,Fog                                     37
Snow,Blowing Snow                            19
Rain,Snow                                    18
Thunderstorms,Rain Showers                   16
Haze                                         16
Drizzle,Snow,Fog                             15
Freezing Rain                                14
Freezing Drizzle,Snow                        11
Freezing Drizzle                        

 ### using .loc and .iloc for selection by label and selection by position

In [11]:
weather_df.loc[0:5, ['Visibility (km)','Rel Hum (%)']]

Unnamed: 0,Visibility (km),Rel Hum (%)
0,8.0,86
1,8.0,87
2,4.0,89
3,4.0,88
4,4.8,88
5,6.4,87


In [12]:
weather_df[['Visibility (km)','Rel Hum (%)']].iloc[0:5]

Unnamed: 0,Visibility (km),Rel Hum (%)
0,8.0,86
1,8.0,87
2,4.0,89
3,4.0,88
4,4.8,88


### Finding instances when snow was recorded

In [13]:
snowed_filter = weather_df['Weather'].str.lower().str.contains('snow')

In [14]:
snowed_filter

0       False
1       False
2       False
3       False
4       False
        ...  
8779     True
8780     True
8781     True
8782     True
8783     True
Name: Weather, Length: 8784, dtype: bool

In [15]:
weather_df['Weather'].iloc[8779]

'Snow'

In [16]:
weather_df[snowed_filter]

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
41,2012-01-02 17:00:00,-2.1,-9.5,57,22,25.0,99.66,Snow Showers
44,2012-01-02 20:00:00,-5.6,-13.4,54,24,25.0,100.07,Snow Showers
45,2012-01-02 21:00:00,-5.8,-12.8,58,26,25.0,100.15,Snow Showers
47,2012-01-02 23:00:00,-7.4,-14.1,59,17,19.3,100.27,Snow Showers
48,2012-01-03 00:00:00,-9.0,-16.0,57,28,25.0,100.35,Snow Showers
...,...,...,...,...,...,...,...,...
8779,2012-12-31 19:00:00,0.1,-2.7,81,30,9.7,100.13,Snow
8780,2012-12-31 20:00:00,0.2,-2.4,83,24,9.7,100.03,Snow
8781,2012-12-31 21:00:00,-0.5,-1.5,93,28,4.8,99.95,Snow
8782,2012-12-31 22:00:00,-0.2,-1.8,89,28,9.7,99.91,Snow


### Find all instances when wind speed was above 24 and visibility was 25

In [17]:
df = weather_df[(weather_df['Wind Spd (km/h)'] > 24) & (weather_df['Visibility (km)'] == 25)]

In [18]:
df.head()

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
23,2012-01-01 23:00:00,5.3,2.0,79,30,25.0,99.31,Cloudy
24,2012-01-02 00:00:00,5.2,1.5,77,35,25.0,99.26,Rain Showers
25,2012-01-02 01:00:00,4.6,0.0,72,39,25.0,99.26,Cloudy
26,2012-01-02 02:00:00,3.9,-0.9,71,32,25.0,99.26,Mostly Cloudy
27,2012-01-02 03:00:00,3.7,-1.5,69,33,25.0,99.3,Mostly Cloudy


### Applying custom functions 

In [19]:
def times2(value):
    return value * 2

In [20]:
weather_df['Visibility (km)'].head()

0    8.0
1    8.0
2    4.0
3    4.0
4    4.8
Name: Visibility (km), dtype: float64

In [21]:
t2 = weather_df['Visibility (km)'].apply(times2)

In [22]:
t2.head()

0    16.0
1    16.0
2     8.0
3     8.0
4     9.6
Name: Visibility (km), dtype: float64

### Adding/Updating columns

In [23]:
visibility_in_meters = weather_df["Visibility (km)"] * 1000
weather_df["Visibility (m)"] = visibility_in_meters

weather_df.head()

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather,Visibility (m)
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog,8000.0
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog,8000.0
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog",4000.0
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog",4000.0
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog,4800.0


### Renaming Columns

In [24]:
# using the "inplace=True" to ensure the renaming has been assigned in the old DataFrame
weather_df.rename(columns={'Visibility (m)': 'Visibility (meters)'}, inplace=True)
weather_df.head()

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather,Visibility (meters)
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog,8000.0
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog,8000.0
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog",4000.0
3,2012-01-01 03:00:00,-1.5,-3.2,88,6,4.0,101.27,"Freezing Drizzle,Fog",4000.0
4,2012-01-01 04:00:00,-1.5,-3.3,88,7,4.8,101.23,Fog,4800.0


### Deleting Columns

In [25]:
weather_df.drop(labels=['Visibility (meters)'], axis=1).head(3) #X=0(X axis), x=1(Y-Axis,column)

Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Weather
0,2012-01-01 00:00:00,-1.8,-3.9,86,4,8.0,101.24,Fog
1,2012-01-01 01:00:00,-1.8,-3.7,87,4,8.0,101.24,Fog
2,2012-01-01 02:00:00,-1.8,-3.4,89,7,4.0,101.26,"Freezing Drizzle,Fog"


### Pivot table (Allows you to reorganize and summarize columns and rows of data in a dataframe)
#### Pivot tables provide an easy way to subset by one column and then apply to a calculation like a sum or a mean

In [26]:
data = {
    'A' : ['foo','foo','foo','bar','bar','bar'],
    'B' : ['one','one','two','two','one','one'],
    'C' : ['x','y','x','y','x','y'],
    'D' : [1, 3, 2, 5, 4, 1]
}

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [27]:
pivot_df = df.pivot_table(values='D',   #which column do we want to aggregate 
                         index='A',     #Select a column to use as the new index
                         columns=['C'],  #Which values do we want to use as the new index
                         aggfunc=np.sum)   #aggregation function to use
pivot_df

C,x,y
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,4,6
foo,3,3


In [28]:
# convert it back to a simple index
pivot_df.reset_index()

C,A,x,y
0,bar,4,6
1,foo,3,3


### what is the avearage temperature recorded by month

In [29]:
mean_temperature_df = weather_df.pivot_table(values='Temp (C)', index=weather_df['Date/Time'].dt.month, aggfunc=np.mean)
mean_temperature_df

Unnamed: 0_level_0,Temp (C)
Date/Time,Unnamed: 1_level_1
1,-7.371505
2,-4.225
3,3.121237
4,7.009306
5,16.237769
6,20.134028
7,22.790054
8,22.279301
9,16.484444
10,10.954973


### Group By

In [30]:
mean_temperature_df2 = weather_df.groupby(weather_df['Date/Time'].dt.month).agg(np.mean).reset_index()
mean_temperature_df2

  mean_temperature_df2 = weather_df.groupby(weather_df['Date/Time'].dt.month).agg(np.mean).reset_index()


Unnamed: 0,Date/Time,Temp (C),Dew Point Temp (C),Rel Hum (%),Wind Spd (km/h),Visibility (km),Stn Press (kPa),Visibility (meters)
0,1,-7.371505,-12.294758,68.383065,18.108871,22.100269,101.005349,22100.268817
1,2,-4.225,-9.221695,68.956897,14.837644,25.182184,101.142414,25182.183908
2,3,3.121237,-3.488575,64.862903,14.514785,26.177957,101.335255,26177.956989
3,4,7.009306,-1.934583,56.15,17.369444,31.777083,100.716833,31777.083333
4,5,16.237769,8.08078,61.760753,12.846774,29.418548,101.057164,29418.548387
5,6,20.134028,11.738056,60.643056,14.681944,32.104167,100.784222,32104.166667
6,7,22.790054,14.59543,62.017473,11.887097,33.655108,100.828333,33655.107527
7,8,22.279301,15.644758,67.943548,13.931452,30.192608,100.927097,30192.607527
8,9,16.484444,10.757917,71.165278,14.108333,30.603472,101.087903,30603.472222
9,10,10.954973,6.533468,75.731183,15.475806,25.111022,100.909368,25111.021505


### Working with DateTime Formats
#### Converting Strings to datetime

In [31]:
df = pd.DataFrame({'date': ['3/10/2000', '3/11/2000', '3/12/2000'],
                  'value': [2, 3, 4]})

In [32]:
df

Unnamed: 0,date,value
0,3/10/2000,2
1,3/11/2000,3
2,3/12/2000,4


In [33]:
df.info() # the date is in type object

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3 non-null      object
 1   value   3 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 176.0+ bytes


In [34]:
df['date'] = pd.to_datetime(df['date'])
df.info()
df# date column is now in datetime format

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    3 non-null      datetime64[ns]
 1   value   3 non-null      int64         
dtypes: datetime64[ns](1), int64(1)
memory usage: 176.0 bytes


Unnamed: 0,date,value
0,2000-03-10,2
1,2000-03-11,3
2,2000-03-12,4


In [35]:
## Day first format
df['date'] = pd.to_datetime(df['date'], dayfirst=True)
df

Unnamed: 0,date,value
0,2000-03-10,2
1,2000-03-11,3
2,2000-03-12,4


In [36]:
###Custom format
df = pd.DataFrame({'date': ['2016-6-10 20:30:0',
                           '2016-7-1 19:45:30',
                           '2013-10-12 4:5:1'],
                  'value': [2, 3, 4]})
df['date'] = pd.to_datetime(df['date'], format="%Y-%d-%m %H:%M:%S")
df

Unnamed: 0,date,value
0,2016-10-06 20:30:00,2
1,2016-01-07 19:45:30,3
2,2013-12-10 04:05:01,4


In [37]:
## Assemble a datetime from multiple columns
df = pd.DataFrame({'year': [2015, 2016],
                  'month': [2, 3],
                  'day': [4, 5]})
df['date'] = pd.to_datetime(df)
df

Unnamed: 0,year,month,day,date
0,2015,2,4,2015-02-04
1,2016,3,5,2016-03-05


In [38]:
## get year, month and day
df = pd.DataFrame({'name': ['Tom', 'Andy', 'Lucas'],
                  'DoB': ['08-05-1997', '04-28-1996', '12-16-1995']})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    3 non-null      object
 1   DoB     3 non-null      object
dtypes: object(2)
memory usage: 176.0+ bytes


In [39]:
df['DoB'] = pd.to_datetime(df['DoB'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   name    3 non-null      object        
 1   DoB     3 non-null      datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 176.0+ bytes


In [40]:
df['year'] = df['DoB'].dt.year
df['month'] = df['DoB'].dt.month
df['day'] = df['DoB'].dt.day
df

Unnamed: 0,name,DoB,year,month,day
0,Tom,1997-08-05,1997,8,5
1,Andy,1996-04-28,1996,4,28
2,Lucas,1995-12-16,1995,12,16


In [41]:
## get the week,day of the week and leap year
df['week_of_year'] = df['DoB'].dt.week
df['day_of_week'] = df['DoB'].dt.dayofweek
df['is_leap_year'] = df['DoB'].dt.is_leap_year
df

  df['week_of_year'] = df['DoB'].dt.week


Unnamed: 0,name,DoB,year,month,day,week_of_year,day_of_week,is_leap_year
0,Tom,1997-08-05,1997,8,5,32,1,False
1,Andy,1996-04-28,1996,4,28,17,6,True
2,Lucas,1995-12-16,1995,12,16,50,5,False


In [42]:
#mapping the day_of_week ccolumn to the day of the week name
dw_mapping={
    0: 'Monday',
    1: 'Tuesday',
    2: 'Wednesday',
    3: 'Thursday',
    4: 'Friday',
    5: 'Saturday',
    6: 'Sunday'
}
df['day_of_week_name'] = df['DoB'].dt.weekday.map(dw_mapping)
df

Unnamed: 0,name,DoB,year,month,day,week_of_year,day_of_week,is_leap_year,day_of_week_name
0,Tom,1997-08-05,1997,8,5,32,1,False,Tuesday
1,Andy,1996-04-28,1996,4,28,17,6,True,Sunday
2,Lucas,1995-12-16,1995,12,16,50,5,False,Saturday


In [43]:
# Get the age based on date of birth
today = pd.to_datetime('today')
df['age'] = today.year - df['DoB'].dt.year
df

Unnamed: 0,name,DoB,year,month,day,week_of_year,day_of_week,is_leap_year,day_of_week_name,age
0,Tom,1997-08-05,1997,8,5,32,1,False,Tuesday,26
1,Andy,1996-04-28,1996,4,28,17,6,True,Sunday,27
2,Lucas,1995-12-16,1995,12,16,50,5,False,Saturday,28


In [44]:
# Considering the months
today = pd.to_datetime('today')
diff_y = today.year - df['DoB'].dt.year

b_md = df['DoB'].apply(lambda x: (x.month,x.day)) # extracts the month and day from the df
no_birthday = b_md > (today.month,today.day)

df['age'] = diff_y - no_birthday
df

Unnamed: 0,name,DoB,year,month,day,week_of_year,day_of_week,is_leap_year,day_of_week_name,age
0,Tom,1997-08-05,1997,8,5,32,1,False,Tuesday,25
1,Andy,1996-04-28,1996,4,28,17,6,True,Sunday,26
2,Lucas,1995-12-16,1995,12,16,50,5,False,Saturday,27


In [None]:
## setting date column to datetime fromat when reading the file
df = pd.read_csv('data/city_sales.csv',parse_dates=[date])

### Working with Different File formats

In [46]:
# JSON
import json

In [47]:
data = {
    "president": {
        "name": "Zaphod Beeblebrox",
        "species": "Betelgeusian"
    }
}

In [48]:
with open("data_file.json", "w") as write_file:
    json.dump(data, write_file)

In [49]:
#read json as dataframe in pandas
jsonStr = '''{"Index0":{"Courses": "Pandas","Discount": "1200"},
           "Index1":{"Courses": "Hadoop","Discount": "1500"},
           "Index2":{"Courses": "Spark","Discount": "1800"}
          }'''
df2 = pd.read_json(jsonStr, orient = 'index')
print(df2)

       Courses  Discount
Index0  Pandas      1200
Index1  Hadoop      1500
Index2   Spark      1800


In [50]:
# converting dictionanry to df
data['president']

{'name': 'Zaphod Beeblebrox', 'species': 'Betelgeusian'}

In [51]:
df3 = pd.DataFrame.from_dict(data, orient = 'index')
df3

Unnamed: 0,name,species
president,Zaphod Beeblebrox,Betelgeusian


## Working with Excel Files

In [52]:
#read excel file as DF in pandas
excel_df = pd.read_excel('D:/data/sample-xlsx-file.xlsx')

In [53]:
excel_df

Unnamed: 0,Name,Email,Date Of Birth,Salary,Department
0,Rajeev Singh,rajeev@example.com,1992-07-21,1500000,Software Engineering
1,John Doe,john@example.com,1965-01-13,1300000,Sales
2,Jack Sparrow,jack@example.com,1986-12-19,1000000,HR
3,Steven Cook,steven@example.com,1994-05-04,1200000,Marketing


### Working with Avro Files

In [54]:
pip install avro-python3

Note: you may need to restart the kernel to use updated packages.


In [55]:
import copy
import json
import avro
from avro.datafile import DataFileWriter, DataFileReader
from avro.io import DatumWriter, DatumReader

In [57]:
schema = {
    'name': 'avro.example.User',
    'type': 'record',
    'fields': [
        {'name': 'name', 'type': 'string'},
        {'name': 'age', 'type': 'int'}
    ]
}
# parse the schema so we can use it to write the data
schema_parsed = avro.schema.Parse(json.dumps(schema))

In [58]:
schema_parsed

<avro.schema.RecordSchema at 0x2063c0fd240>

In [None]:
# Write data to an avro file
with open('users.avro', 'wb') as f:
    writer = DataFileWriter(f, DatumWriter(), schema_parsed)
    writer.append({'name': 'Pierre-Simon Laplace', 'age': 77})
    writer.append({'name': 'John von Neumann', 'age': 53})
    writer.close()
    
print(f'Schema that we specified:\n {schema}')
print(f'Schema that we parsed:\n {schema_parsed}')
print(f'Schema from users.avro file:\n {schema_from_file}')
print(f'Users:\n {users}')

### Reading Avro Using Pandas

In [64]:
pip install pandavro

Collecting pandavro
  Downloading pandavro-1.7.1.tar.gz (8.1 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting fastavro==1.5.1
  Downloading fastavro-1.5.1-cp310-cp310-win_amd64.whl (425 kB)
     ------------------------------------ 425.0/425.0 kB 379.2 kB/s eta 0:00:00
Building wheels for collected packages: pandavro
  Building wheel for pandavro (setup.py): started
  Building wheel for pandavro (setup.py): finished with status 'done'
  Created wheel for pandavro: filename=pandavro-1.7.1-py3-none-any.whl size=5670 sha256=2b06448ebdb9987318d4a7f444aef553bfe2414f1e020fcce96291a604000358
  Stored in directory: c:\users\dell\appdata\local\pip\cache\wheels\ee\af\a8\f7446fd1c6102f312ccfd1378e8413fc1111769a6bd34998da
Successfully built pandavro
Installing collected packages: fastavro, pandavro
Successfully installed fastavro-1.5.1 pandavro-1.7.1
Note: you may need to restart the kernel to use updated packages.


In [65]:
import copy
import json
import pandas as pd
import pandavro as pdx
from avro.datafile import DataFileReader
from avro.io import DatumReader

In [66]:
# Data to be saved
users = [{'name': 'Pierre-Simon Laplace', 'age': 77},
         {'name': 'John von Neumann', 'age': 53}]
users_df = pd.DataFrame.from_records(users)
print(users_df)

                   name  age
0  Pierre-Simon Laplace   77
1      John von Neumann   53


In [None]:
pdx.to_avro('data/users_test.avro', users_df)

In [None]:
# Read the data back
users_df_redux = pdx.from_avro('data/users_test.avro')
print(type(users_df_redux))

In [None]:
# Check the schema for "users.avro"
with open('users.avro', 'rb') as f:
    reader = DataFileReader(f, DatumReader())
    metadata = copy.deepcopy(reader.meta)
    schema_from_file = json.loads(metadata['avro.schema'])
    reader.close()
print(schema_from_file)