<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Analysis-of-selected-funds" data-toc-modified-id="Analysis-of-selected-funds-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Analysis of selected funds</a></span><ul class="toc-item"><li><span><a href="#Number-of-unique-port_nos-and-fund_nos" data-toc-modified-id="Number-of-unique-port_nos-and-fund_nos-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Number of unique port_nos and fund_nos</a></span></li><li><span><a href="#Number-of-unique-funds-per-year" data-toc-modified-id="Number-of-unique-funds-per-year-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Number of unique funds per year</a></span></li><li><span><a href="#Distribution-by-number-of-observations-per-fund" data-toc-modified-id="Distribution-by-number-of-observations-per-fund-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Distribution by number of observations per fund</a></span></li><li><span><a href="#Take-only-the-end-of-the-year-observation-per-fund" data-toc-modified-id="Take-only-the-end-of-the-year-observation-per-fund-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Take only the end of the year observation per fund</a></span></li><li><span><a href="#Distribution-by-lipper_class" data-toc-modified-id="Distribution-by-lipper_class-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Distribution by lipper_class</a></span></li><li><span><a href="#How-often-do-lipper_classes-change?" data-toc-modified-id="How-often-do-lipper_classes-change?-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>How often do lipper_classes change?</a></span></li></ul></li></ul></div>

# Analysis of selected funds

Main insights:

- ~3700 unique funds over the complete timeframe
- Strong break in 2010 (few observations before)
- ~2500 unique funds per year
- ~6% of funds change their Lipper class over the complete timeframe



In [None]:
import feather

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Year referenced by the (currently commented-out) `year == @year` filter on info_df
year = 2016

In [None]:
# Load the row-level fund information table from its feather file.
# NOTE(review): the path is relative, so the notebook presumably has to be
# started from its own directory for this to resolve — confirm.
path = '../data/processed/row_info.feather'
info_df = feather.read_dataframe(path)

In [None]:
# Optional filter, currently disabled: keep only the rows of the year set above.
# NOTE(review): the visible cells do not create a `year` column before this
# point, so the query may fail if it is re-enabled as-is — confirm.
#info_df = info_df.query('year == @year')

In [None]:
# Peek at one random row to check the columns and dtypes.
# A fixed seed keeps the displayed row identical across "Restart & Run All".
info_df.sample(random_state=42)

## Number of unique port_nos and fund_nos

In [None]:
# Summary statistics of the report dates
info_df.loc[:, 'report_dt'].describe()

In [None]:
# Number of distinct crsp_portno values, then distinct crsp_fundno values,
# printed one per line.
print(info_df['crsp_portno'].nunique(),
      info_df['crsp_fundno'].nunique(),
      sep='\n')

## Number of unique funds per year

#### Something seems odd
The reason for the break in 2009 is not clear.

In [None]:
# Apply seaborn's default theme to the matplotlib figures.
sns.set()

# Bar chart of the number of distinct crsp_portno values for every
# (year, month) combination of report_dt.
(info_df["crsp_portno"]
     .groupby(by=[info_df["report_dt"].dt.year, info_df["report_dt"].dt.month])
     .nunique()
     .plot.bar(color='b', figsize=(10, 5)))

## Distribution by number of observations per fund 

In [None]:
# Histogram (10 bins) of how many rows each crsp_portno has.
(info_df
     .groupby("crsp_portno")["crsp_portno"]
     .count()
     .plot.hist(bins=10, color='b', figsize=(10, 5)))

## Take only the end of the year observation per fund

In [None]:
# Row labels of the last observation per fund (crsp_portno) and calendar year.
# `tail(1)` returns the last row of each group in the frame's current order,
# so the rows are sorted by fund and report_dt first. That makes "last" mean
# the latest report_dt rather than whatever row happens to be stored last.
# The resulting labels are ordered by crsp_portno, then report_dt.
index = (info_df
     .assign(year = info_df["report_dt"].dt.year)
     .sort_values(['crsp_portno','report_dt'])
     .groupby(['crsp_portno','year'])
     [['crsp_portno','report_dt']]
     .tail(1)
     .index)

In [None]:
# Keep only the rows selected in `index`.
# NOTE: this overwrites info_df, so every cell that runs after this one sees
# the reduced frame rather than the full data that was loaded.
info_df = info_df.loc[index]

## Distribution by lipper_class

In [None]:
# Horizontal bar chart: number of rows (non-null crsp_portno) per lipper_class.
# Bars follow the sorted group keys; they are not ordered by size.
(info_df
     .groupby("lipper_class")["crsp_portno"]
     .count()
     .plot.barh(color='b', figsize=(20, 5)))

In [None]:
# Horizontal bar chart: number of rows (non-null crsp_portno) per cap_class.
(info_df
     .groupby('cap_class')['crsp_portno']
     .count()
     .plot.barh(color='b', figsize=(10, 5)))

In [None]:
# Horizontal bar chart: number of rows (non-null crsp_portno) per style_class.
(info_df
     .groupby('style_class')['crsp_portno']
     .count()
     .plot.barh(color='b', figsize=(10, 5)))

## How often do lipper_classes change?


In [None]:
# For every crsp_portno, count its distinct lipper_class values, then plot how
# many portfolios share each of those counts (smallest bar first).
obj_per_portno = info_df.loc[:, ['crsp_portno', 'lipper_class']]
ax = (obj_per_portno
    .groupby("crsp_portno")["lipper_class"]
    .nunique()
    .value_counts()
    .sort_values()
    .plot.barh(color='b',
               figsize=(18, 5),
               title='Number of unique objective codes per crsp_portno'))

In [None]:
# Transition table of lipper_class: current value (rows) against the value in
# the same fund's previous observation (columns), as a percentage of all
# transitions.
lagged_class = (info_df[['crsp_portno','report_dt','lipper_class']]
    .sort_values(['crsp_portno','report_dt']))

# Shift within each fund so a fund's first observation gets NaN rather than the
# last value of whichever fund is stored before it. A frame-wide shift followed
# by dropping each fund's first remaining row also discarded valid transitions
# whenever that first row had already been removed by dropna().
lagged_class = lagged_class.assign(
    lag_lipper_class = lagged_class.groupby('crsp_portno')['lipper_class'].shift())

# Drops every fund's first observation (no predecessor) and any row with a
# missing value in one of the selected columns.
lagged_class = lagged_class.dropna()

round(pd.crosstab(lagged_class.lipper_class, lagged_class.lag_lipper_class, normalize = 'all') * 100,2)

In [None]:
# Transition table of style_class: current value (rows) against the value in
# the same fund's previous observation (columns), as absolute counts.
lagged_class = (info_df[['crsp_portno','report_dt','style_class']]
    .sort_values(['crsp_portno','report_dt']))

# Shift within each fund so a fund's first observation gets NaN rather than the
# last value of whichever fund is stored before it. A frame-wide shift followed
# by dropping each fund's first remaining row also discarded valid transitions
# whenever that first row had already been removed by dropna().
lagged_class = lagged_class.assign(
    lag_style_class = lagged_class.groupby('crsp_portno')['style_class'].shift())

# Drops every fund's first observation (no predecessor) and any row with a
# missing value in one of the selected columns.
lagged_class = lagged_class.dropna()

pd.crosstab(lagged_class.style_class, lagged_class.lag_style_class)

In [None]:
# Transition table of cap_class: current value (rows) against the value in the
# same fund's previous observation (columns), as a percentage of all
# transitions, including row/column totals.
lagged_class = (info_df[['crsp_portno','report_dt','cap_class']]
    .sort_values(['crsp_portno','report_dt']))

# Shift within each fund so a fund's first observation gets NaN rather than the
# last value of whichever fund is stored before it. A frame-wide shift followed
# by dropping each fund's first remaining row also discarded valid transitions
# whenever that first row had already been removed by dropna().
lagged_class = lagged_class.assign(
    lag_cap_class = lagged_class.groupby('crsp_portno')['cap_class'].shift())

# Drops every fund's first observation (no predecessor) and any row with a
# missing value in one of the selected columns.
lagged_class = lagged_class.dropna()

round(pd.crosstab(lagged_class.cap_class, lagged_class.lag_cap_class, margins = True, normalize= 'all') *100,2)

In [None]:
# cap_class against its lagged value as absolute counts, with row/column totals.
pd.crosstab(index=lagged_class['cap_class'],
            columns=lagged_class['lag_cap_class'],
            margins=True)

### Fund names