In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [24]:
# read file 
df_std = pd.read_csv('csv_files/p2no_skew.csv', index_col=0)
df_std.head()

Unnamed: 0,C1,C3,C4,C5,C6,C7,T1,T2,T3,T4,...,S3,C3p,C5p,C2,LongSentences,RealWords,LongWords,PositiveWords,NegativeWords,UncertainWords
0,2.227033,1.4884,0.029074,17.528157,16.176365,1.927893,21.679483,112.778544,107.51744,17.349352,...,2.275984,0,1.351792,1.0,0.640426,0.646515,0.61527,0.169556,0.209529,0.221915
1,2.52474,0.963174,-0.013352,17.178286,14.690979,1.720483,28.124722,147.621137,139.946418,22.58318,...,2.487662,1,2.487306,0.0,0.644753,0.641182,0.609752,0.148247,0.223103,0.221556
2,2.116933,0.806476,0.020715,16.639604,15.424948,1.395264,14.177447,72.539644,69.173694,11.313708,...,1.978602,1,1.214656,1.0,0.636816,0.646763,0.628717,0.226322,0.184411,0.225078
3,2.436077,0.647103,0.020023,16.513419,15.319588,1.429303,18.11077,90.87904,87.028731,13.304135,...,2.188931,1,1.193831,1.0,0.539634,0.650793,0.62849,0.213167,0.192593,0.23704
4,2.075782,0.067659,-0.034895,17.241131,15.925724,2.929681,23.916521,121.778487,114.786759,18.330303,...,2.346674,0,1.315407,1.0,0.587413,0.635767,0.603993,0.165315,0.215782,0.224143


In [25]:
#Creating a new df withe just Column 'C1' in it 
my_col = df_std.C1
my_col.head()

0    2.227033
1    2.524740
2    2.116933
3    2.436077
4    2.075782
Name: C1, dtype: float64

In [26]:
#Calculating the upper bound (> 3 standard deviations) for Column 'C1'
u2_bound = my_col.mean() + 3 * my_col.std()
u2_bound

2.975333116663967

In [27]:
# detecting any values are greater than the upper boundary
my_col[my_col>u2_bound]

65     3.574816
141    3.067427
228    3.089133
391    3.316625
413    3.004782
628    3.063733
665    3.289497
Name: C1, dtype: float64

In [28]:
#Replace the values greater than the upper boundary with the upper boundary
my_col[my_col>u2_bound] = u2_bound

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [29]:
# verifying there is no value greater than the upper boundary
pd.Series(my_col>u2_bound).value_counts()

False    682
Name: C1, dtype: int64

In [30]:
#Calculating the lower bound (< 3 standard deviations) for Column 'C1'
l2_bound = my_col.mean() - 3 * my_col.std()
l2_bound

1.5019178597752965

In [31]:
# detecting any values are less than the lower boundary
my_col[my_col<l2_bound]

148    1.491301
575    1.467799
589    1.467799
Name: C1, dtype: float64

In [32]:
#Replace the values lower than the lower boundary with the lower boundary
my_col[my_col<l2_bound] = l2_bound

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [33]:
# verifying there is no value less than the lower boundary
pd.Series(my_col<l2_bound).value_counts()

False    682
Name: C1, dtype: int64

In [34]:
# going for all the columns in the DataFrame
for col in df_std.columns:
    u_bound = df_std[col].mean() + 3 * df_std[col].std()
    l_bound = df_std[col].mean() - 3 * df_std[col].std()
    df_std[col][df_std[col] > u_bound] = u_bound
    df_std[col][df_std[col] < l_bound] = l_bound

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [35]:
#Trying to see if there are any outliers left in the DataFrame
outlier = (df_std < l_bound) | (df_std > u_bound)
print(outlier)

       C1     C3    C4    C5    C6    C7    T1    T2    T3    T4  \
0    True   True  True  True  True  True  True  True  True  True   
1    True   True  True  True  True  True  True  True  True  True   
2    True   True  True  True  True  True  True  True  True  True   
3    True   True  True  True  True  True  True  True  True  True   
4    True   True  True  True  True  True  True  True  True  True   
5    True   True  True  True  True  True  True  True  True  True   
6    True   True  True  True  True  True  True  True  True  True   
7    True   True  True  True  True  True  True  True  True  True   
8    True   True  True  True  True  True  True  True  True  True   
9    True   True  True  True  True  True  True  True  True  True   
10   True   True  True  True  True  True  True  True  True  True   
11   True   True  True  True  True  True  True  True  True  True   
12   True  False  True  True  True  True  True  True  True  True   
13   True   True  True  True  True  True  True  

In [37]:
# going for all the columns in the DataFrame
u_bound = df_std[col].mean() + 3 * df_std[col].std()
l_bound = df_std[col].mean() - 3 * df_std[col].std()
for col in df_std.columns:
    u_bound = df_std[col].mean() + 3 * df_std[col].std()
    l_bound = df_std[col].mean() - 3 * df_std[col].std()
    if df_std[col] > u_bound:
        return u_bound
    elif df_std[col] < l_bound:
        return l_bound
    else:
        pass

SyntaxError: 'return' outside function (<ipython-input-37-a7546a99cbeb>, line 8)