In [31]:
import os
import shutil
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


In [39]:
# Import data
data_path = os.path.join("..", 'data', 'data_removeMissingValues.csv')
df = pd.read_csv(data_path)
print(df.head())

  municipality  year  treat  newcomers     rents  popu
0       adachi  2008      0    28533.0  281500.0   484
1       adachi  2009      0    31389.0  275200.0   528
2       adachi  2010      0    27831.0  263500.0   459
3       adachi  2011      0    27978.0  262100.0   447
4       adachi  2012      0    27850.0  262800.0   423


In [41]:
# Filter observations - keep only shibuya
df_filtered1 = df[df['municipality'] == "shibuya"]
print(df_filtered1.head())
# Filter observations - keep only data after 2010
df_filtered2 = df[df['year'] >=2010]
print(df_filtered2.head())

# Filter variables
df_filtered3 = df[['municipality', 'year', 'newcomers']]
print(df_filtered3.head())

# Transform variables - create a new column with new values
df_filtered2['popu'].replace(0, np.nan, inplace=True)
df_filtered2['rents_per_capita'] = df_filtered2['rents'] / df_filtered2['popu']
print(df_filtered1.head())


    municipality  year  treat  newcomers      rents  popu
202      shibuya  2008      1    18507.0  1191300.0   469
203      shibuya  2009      1    19661.0  1042300.0   472
204      shibuya  2010      1    19652.0   949000.0   457
205      shibuya  2011      1    20709.0   980600.0   421
206      shibuya  2012      1    20881.0   964200.0   374
  municipality  year  treat  newcomers     rents  popu
2       adachi  2010      0    27831.0  263500.0   459
3       adachi  2011      0    27978.0  262100.0   447
4       adachi  2012      0    27850.0  262800.0   423
5       adachi  2013      0    27124.0  254800.0   436
6       adachi  2016      0    28195.0  264300.0   364
  municipality  year  newcomers
0       adachi  2008    28533.0
1       adachi  2009    31389.0
2       adachi  2010    27831.0
3       adachi  2011    27978.0
4       adachi  2012    27850.0
    municipality  year  treat  newcomers      rents  popu
202      shibuya  2008      1    18507.0  1191300.0   469
203      shibu

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered2['popu'].replace(0, np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered2['popu'].replace(0, np.nan, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered2['rents_per_capita'] = 

In [43]:
# Save data (Question 7)
data_folder_path = os.path.join("..", 'data', 'data_filtered_by_observation.csv')
df_filtered1.to_csv(data_folder_path, index=False)
print(df_filtered1.head())

data_folder_path = os.path.join("..", 'data', 'data_filtered_by_variables.csv')
df_filtered3.to_csv(data_folder_path, index=False)
print(df_filtered3.head())

data_folder_path = os.path.join("..", 'data', 'data_filtered_with_transformation.csv')
df_filtered2.to_csv(data_folder_path, index=False)
print(df_filtered2.head())


    municipality  year  treat  newcomers      rents  popu
202      shibuya  2008      1    18507.0  1191300.0   469
203      shibuya  2009      1    19661.0  1042300.0   472
204      shibuya  2010      1    19652.0   949000.0   457
205      shibuya  2011      1    20709.0   980600.0   421
206      shibuya  2012      1    20881.0   964200.0   374
  municipality  year  newcomers
0       adachi  2008    28533.0
1       adachi  2009    31389.0
2       adachi  2010    27831.0
3       adachi  2011    27978.0
4       adachi  2012    27850.0
  municipality  year  treat  newcomers     rents   popu  rents_per_capita
2       adachi  2010      0    27831.0  263500.0  459.0        574.074074
3       adachi  2011      0    27978.0  262100.0  447.0        586.353468
4       adachi  2012      0    27850.0  262800.0  423.0        621.276596
5       adachi  2013      0    27124.0  254800.0  436.0        584.403670
6       adachi  2016      0    28195.0  264300.0  364.0        726.098901


In [45]:
# Perform operation in files in different folders - save this notebook into script folder
main_folder = 'coe_final_assignment'
target_folder = os.path.join("..", 'scripts_stata_python')
current_notebook_path = os.path.abspath("Question6&7_sample_for_analysis_by_filtering_observations.ipynb")
target_notebook_path = os.path.join(target_folder, os.path.basename(current_notebook_path))
shutil.move(current_notebook_path, target_notebook_path)
print(f"Notebook moved to {target_notebook_path}")

Notebook moved to ../scripts_stata_python/Question6&7_sample_for_analysis_by_filtering_observations.ipynb
