# Importing 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib
from sklearn.ensemble import IsolationForest

# Affluent Achievers

In [None]:
# import combined file
AA_daily = pd.read_csv(r"\AA_blocks_combined_daily.csv")

In [None]:
# remove rows that have null values
AA_daily.dropna()

In [None]:
# rounding energy_sum column to 2 decimal places
AA_daily.energy_sum = AA_daily.energy_sum.round(2)

In [None]:
# checking what the data looks like
AA_daily.head(5)

In [None]:
len(AA_daily)

In [None]:
# descriptive stats of energy sum column
AA_daily["energy_sum"].describe()

In [None]:
# skewness and kurtosis values of energy sum column
print ("Skew: %f"%AA_daily["energy_sum"].skew())
print ("Kurt: %f"%AA_daily["energy_sum"].kurt())

In [None]:
# keep only rows that contain no NaN or +/-infinity in any column
# - axis is passed by keyword because pandas >= 2.0 no longer accepts it positionally
# - .copy() makes the result an independent dataframe, so adding columns to it
#   later does not raise SettingWithCopyWarning
AA_daily = AA_daily[~AA_daily.isin([np.nan, np.inf, -np.inf]).any(axis=1)].copy()

In [None]:
# build an Isolation Forest with 60 trees and fit it on the energy_sum column only
# contamination=0.1 is the proportion of rows the model is told to expect as anomalies
# NOTE(review): no random_state is passed, so flagged rows may differ between runs
aa_ad = IsolationForest(
    n_estimators=60,
    max_samples='auto',
    contamination=0.1,
    max_features=1.0,
)
aa_ad.fit(AA_daily[['energy_sum']])

In [None]:
# attach the model outputs to the dataframe as two new columns:
#   Score   - the value returned by decision_function for each row
#   Anomaly - 1 when the row is not an anomaly, -1 when it is an anomaly
AA_daily["Score"] = aa_ad.decision_function(AA_daily[["energy_sum"]])
AA_daily["Anomaly"] = aa_ad.predict(AA_daily[["energy_sum"]])
AA_daily.iloc[:10]

In [None]:
# storing the anomalies in a new dataframe
anomalies=AA_daily.loc[AA_daily['Anomaly']==-1]
anomalies.head(5)

In [None]:
# counting how many anomalies there are
len(anomalies)

In [None]:
# sorting the new dataframe in descending order
anomalies.sort_values(by=["energy_sum"], ascending=False)

In [None]:
# percentage of data that are anomalies
print("Percentage of data that are anomalies:",round(len(anomalies)/len(AA_daily)*100,1),"%")

In [None]:
anomalies.to_csv(r'\AA_anomalies.csv', index = False)

# Rising Prosperity

In [None]:
RP_daily = pd.read_csv(r"\RP_blocks_combined_daily.csv")

In [None]:
RP_daily.dropna()

In [None]:
RP_daily.energy_sum = RP_daily.energy_sum.round(2)

In [None]:
RP_daily.head(5)

In [None]:
len(RP_daily)

In [None]:
RP_daily["energy_sum"].describe()

In [None]:
print ("Skew: %f"%RP_daily["energy_sum"].skew())
print ("Kurt: %f"%RP_daily["energy_sum"].kurt())

In [None]:
# keep only rows that contain no NaN or +/-infinity in any column
# - axis is passed by keyword because pandas >= 2.0 no longer accepts it positionally
# - .copy() makes the result an independent dataframe, so adding columns to it
#   later does not raise SettingWithCopyWarning
RP_daily = RP_daily[~RP_daily.isin([np.nan, np.inf, -np.inf]).any(axis=1)].copy()

In [None]:
rp_ad=IsolationForest(n_estimators=60, max_samples='auto', contamination=float(0.1),max_features=1.0)
rp_ad.fit(AA_daily[['energy_sum']])

In [None]:
RP_daily['Score']=aa_ad.decision_function(RP_daily[['energy_sum']])
RP_daily['Anomaly']=aa_ad.predict(RP_daily[['energy_sum']])
RP_daily.head(10)

In [None]:
len(RP_daily)

In [None]:
anomalies2=RP_daily.loc[RP_daily['Anomaly']==-1]
anomaly_index=list(anomalies2.index)
anomalies2.head(5)

In [None]:
len(anomalies2)

In [None]:
anomalies2.sort_values(by=["energy_sum"], ascending=False)

In [None]:
date2=RP_daily.loc[RP_daily['day']=='03/11/2013']

In [None]:
date2.energy_sum

In [None]:
print("Percentage of data that are anomalies:",round(len(anomalies2)/len(RP_daily)*100,2),"%")

In [None]:
anomalies2.to_csv(r'\RP_anomalies.csv', index = False)