In [None]:
!pip install panelsplit

In [None]:
import pandas as pd
import numpy as np
from panelsplit.cross_validation import PanelSplit
from typing import Union
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
class FeatureEngineer:

    """
    This is a class that contains general methods that can be applied to a panel DataFrame to create new features.
    Examples of such methods include creating lagged variables, rolling min/mean/max/sum/std, exponentially weighted
    rolling mean/sum and threshold-based counters. Every public method copies its input, so the DataFrame passed in
    is never modified, and every method groups by ``groupby_cols`` so that features never leak across groups.

    Attributes:
    -----------
    groupby_cols : Union[str, list]
        A str or list of columns (or index level names) to group by

    Methods:
    --------

    lag(input_df:pd.DataFrame, y_col:str, lags:list):
        This is a method that creates lagged variables for a given column in a DataFrame.

    rolling_sum(input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):
        This is a method that creates the rolling sum of specified windows for a given column in a DataFrame.

    rolling_mean(input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):
        This is a method that creates the rolling mean of specified windows for a given column in a DataFrame.

    rolling_min(input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):
        This is a method that creates the rolling min of specified windows for a given column in a DataFrame.

    rolling_max(input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):
        This is a method that creates the rolling max of specified windows for a given column in a DataFrame.

    rolling_std(input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):
        This is a method that creates the rolling standard deviation of specified windows for a given column in a DataFrame.

    weighted_rolling_sum(input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False, alpha = 0.8):
        This is a method that creates the weighted rolling sum of specified windows for a given column in a DataFrame.

    weighted_rolling_mean(input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False, alpha = 0.8):
        This is a method that creates the weighted rolling mean of specified windows for a given column in a DataFrame.

    since(input_df:pd.DataFrame, y_col:str, thresholds:list, shift_knowledge:int = None):
        This is a method that counts the number of periods since a variable has been above a given threshold.

    ongoing(input_df:pd.DataFrame, y_col:str, thresholds:list, shift_knowledge:int = None):
        This is a method that represents a sequential count of the number of periods for which a variable has been above a given threshold.

    Private helpers: _index_check, _rolling_feature, _create_exponential_weights, _threshold_feature,
    _count_since and _count_ongoing.

    Notes:
    -------
    Be very careful with NAs when using the since and ongoing methods.
    The way we are computing things here (i.e. using a > th condition) means they are treated as a 0/False.
    """

    def __init__(self, groupby_cols: Union[str, list]):

        self.groupby_cols = groupby_cols

    def _index_check(self, df:pd.DataFrame):

        """
        This is a method that checks if the index of a DataFrame is sorted correctly.

        Args:
        -----
        :param df: The DataFrame to check.

        Returns:
        --------
        :return: None. The DataFrame is only inspected, never modified.

        Raises:
        -------
        :raises AssertionError: If the index is not monotonically increasing.
        """

        # Raised explicitly (rather than via an `assert` statement) so the check still runs under `python -O`.
        # The exception type is kept as AssertionError so existing callers are unaffected.
        if not df.index.is_monotonic_increasing:
            raise AssertionError("The index of the DataFrame should be monotonically increasing.")

    def lag(self, input_df:pd.DataFrame, y_col:str, lags:list):

        """
        This is a method that creates lagged variables for a given column in a DataFrame.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create lagged variables.
        :param lags: A list of lag values to create.

        Returns:
        --------
        :return: A copy of the DataFrame with one '<y_col>_basic_lag<lag>' column appended per lag.
        """
        df = input_df.copy()

        self._index_check(df)

        for lag in lags:
            df[y_col + '_basic_lag' + str(lag)] = df.groupby(self.groupby_cols)[y_col].shift(lag)
        return df

    def _rolling_feature(self, input_df:pd.DataFrame, y_col:str, windows:list, label:str, aggregate, closed = None, return_logs = False):

        """
        This is the shared implementation behind every rolling_* and weighted_rolling_* method.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create rolling variables.
        :param windows: A list of window sizes.
        :param label: The text placed between y_col and the window size in the new column name (e.g. 'rolling_sum').
        :param aggregate: A callable that receives a pandas Rolling object and returns the aggregated Series.
        :param closed: Passed straight through to pandas' rolling(closed=...).
        :param return_logs: If True, store log1p of the feature as 'ln_<name>' instead of the raw feature.

        Returns:
        --------
        :return: A copy of the DataFrame with one new column per window.
        """

        df = input_df.copy()

        self._index_check(df)

        for w in windows:
            col_name = y_col + '_' + label + str(w)
            # transform() returns a Series carrying the original index, so each value lands on the row it was
            # computed from even when the rows of a group are not stored contiguously. Assigning the raw
            # `.values` of a groupby-rolling result would silently misalign in that situation.
            # `w = w` binds the current window size to the lambda.
            df[col_name] = df.groupby(self.groupby_cols)[y_col].transform(
                lambda s, w = w: aggregate(s.rolling(w, min_periods=1, closed = closed))
            )
            if return_logs:
                df['ln_' + col_name] = np.log1p(df[col_name])
                df = df.drop(col_name, axis = 1)
        return df

    def rolling_sum(self, input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):

        """
        This is a method that creates the rolling sum of specified windows for a given column in a DataFrame.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create rolling variables.
        :param windows: A list of windows to generate a rolling sum for.
        :param closed: A string indicating the side of the window interval to close on. Closed = 'left' omits the current observation.
        :param return_logs: A boolean indicating whether to return the log (log1p) of the rolling sum.

        Returns:
        --------
        :return: A copy of the DataFrame with the rolling sum variables appended.

        """

        return self._rolling_feature(input_df, y_col, windows, 'rolling_sum', lambda r: r.sum(), closed, return_logs)

    def rolling_mean(self, input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):

        """
        This is a method that creates the rolling mean of specified windows for a given column in a DataFrame.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create rolling variables.
        :param windows: A list of windows to generate a rolling mean for.
        :param closed: A string indicating the side of the window interval to close on. Closed = 'left' omits the current observation.
        :param return_logs: A boolean indicating whether to return the log (log1p) of the rolling mean.

        Returns:
        --------
        :return: A copy of the DataFrame with the rolling mean variables appended.

        """

        return self._rolling_feature(input_df, y_col, windows, 'rolling_mean', lambda r: r.mean(), closed, return_logs)

    def rolling_min(self, input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):

        """
        This is a method that creates the rolling min of specified windows for a given column in a DataFrame.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create rolling variables.
        :param windows: A list of windows to generate a rolling min for.
        :param closed: A string indicating the side of the window interval to close on. Closed = 'left' omits the current observation.
        :param return_logs: A boolean indicating whether to return the log (log1p) of the rolling min.

        Returns:
        --------
        :return: A copy of the DataFrame with the rolling min variables appended.

        """

        return self._rolling_feature(input_df, y_col, windows, 'rolling_min', lambda r: r.min(), closed, return_logs)

    def rolling_max(self, input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):

        """
        This is a method that creates the rolling max of specified windows for a given column in a DataFrame.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create rolling variables.
        :param windows: A list of windows to generate a rolling max for.
        :param closed: A string indicating the side of the window interval to close on. Closed = 'left' omits the current observation.
        :param return_logs: A boolean indicating whether to return the log (log1p) of the rolling max.

        Returns:
        --------
        :return: A copy of the DataFrame with the rolling max variables appended.

        """

        return self._rolling_feature(input_df, y_col, windows, 'rolling_max', lambda r: r.max(), closed, return_logs)

    def rolling_std(self, input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False):

        """
        This is a method that creates the rolling standard deviation of specified windows for a given column in a DataFrame.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create rolling variables.
        :param windows: A list of windows to generate a rolling standard deviation for.
        :param closed: A string indicating the side of the window interval to close on. Closed = 'left' omits the current observation.
        :param return_logs: A boolean indicating whether to return the log (log1p) of the rolling standard deviation.

        Returns:
        --------
        :return: A copy of the DataFrame with the rolling standard deviation variables appended.

        """

        return self._rolling_feature(input_df, y_col, windows, 'rolling_std', lambda r: r.std(), closed, return_logs)

    def _create_exponential_weights(self, window_size, alpha=0.8):

        """
        This is a method that enables generating "rolling" exponential weights for a given window size.

        Args:
        -----
        :param window_size: The size of the window for which weights are calculated.
        :param alpha: The decay factor for weights, defaults to 0.8. An observation k steps before the end of
                    the window gets raw weight alpha ** k, so for 0 < alpha < 1 a LOWER alpha discounts older
                    observations faster.

        Returns:
        -----
        :return: A numpy array of weights that sum to 1, ordered from the oldest position to the most recent one.
        """

        weights = alpha ** np.arange(window_size)
        normalized_weights = weights / weights.sum()
        # reversed so the largest weight (alpha ** 0) lines up with the last element of the window
        return normalized_weights[::-1]

    def weighted_rolling_sum(self, input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False, alpha = 0.8):

        """
        This is a method that creates the weighted rolling sum of specified windows for a given column in a DataFrame.
        Each window is multiplied element-wise by the normalized exponential weights and then summed.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create weighted rolling variables.
        :param windows: A list of windows to generate a weighted rolling sum for.
        :param closed: A string indicating the side of the window interval to close on. Closed = 'left' omits the current observation.
        :param return_logs:  A boolean indicating whether to return the log (log1p) of the weighted rolling sum.
        :param alpha: The decay factor for weights, defaults to 0.8. For 0 < alpha < 1 a lower alpha discounts older observations faster.

        Returns:
        -----
        :return: A copy of the DataFrame with the weighted rolling sum variables appended.
        """

        def weighted_sum(window_values):
            # weights are rebuilt per call because early windows are shorter than the requested size
            return np.sum(self._create_exponential_weights(len(window_values), alpha) * window_values)

        return self._rolling_feature(
            input_df, y_col, windows, 'weighted_rolling_sum',
            lambda r: r.apply(weighted_sum, raw = True), closed, return_logs
        )

    def weighted_rolling_mean(self, input_df:pd.DataFrame, y_col:str, windows:list, closed = None, return_logs = False, alpha = 0.8):
        """
        This is a method that creates the weighted rolling mean of specified windows for a given column in a DataFrame.
        The value is the weighted rolling sum divided by the number of observations in the window.

        NOTE(review): the weights already sum to 1, so the weighted sum is itself a weighted average; dividing by
        the window length again shrinks it further. Kept as-is to preserve existing outputs - confirm this is intended.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create weighted rolling variables.
        :param windows: A list of windows to generate a weighted rolling mean for.
        :param closed: A string indicating the side of the window interval to close on. Closed = 'left' omits the current observation.
        :param return_logs:  A boolean indicating whether to return the log (log1p) of the weighted rolling mean.
        :param alpha: The decay factor for weights, defaults to 0.8. For 0 < alpha < 1 a lower alpha discounts older observations faster.

        Returns:
        --------
        :return: A copy of the DataFrame with the weighted rolling mean variables appended.
        """

        def weighted_mean(window_values):
            n_obs = len(window_values)
            return np.sum(self._create_exponential_weights(n_obs, alpha) * window_values) / n_obs

        return self._rolling_feature(
            input_df, y_col, windows, 'weighted_rolling_mean',
            lambda r: r.apply(weighted_mean, raw = True), closed, return_logs
        )

    def _count_since(self, x: pd.Series):
        """
        This is a method that counts the number of periods since a variable has been 1.

        :param x: A pandas Series (or any iterable) containing the binary target variable.

        Returns:
        - y (list): A list containing the number of periods since the target variable has been 1.
                    A series that starts without a 1 starts counting at 1.
        """

        y = []
        for value in x:
            if value == 1:
                y.append(0) # reset to 0 if flows
            else:
                # add 1 if no flows; the first observation has no predecessor, so it counts from 0
                previous = y[-1] if y else 0
                y.append(previous + 1)
        return y

    def _count_ongoing(self, x: pd.Series):
        """
        This is a method that generates a sequential count of the periods for which a variable has been 1.

        :param x: A pandas Series (or any iterable) containing the binary target variable.

        Returns:
        - y (list): A list containing the sequential count of the periods for which the target variable has been 1.
        """

        y = []
        episode_counter = 0
        for value in x:
            if value == 1:
                episode_counter += 1 # if there are flows, extend the current episode
            else:
                episode_counter = 0 # reset to 0 if no flows
            y.append(episode_counter)
        return y

    def _threshold_feature(self, input_df:pd.DataFrame, y_col:str, thresholds:list, shift_knowledge, label:str, counter):

        """
        This is the shared implementation behind since() and ongoing().

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column to compare against each threshold.
        :param thresholds: A list of thresholds; the binary indicator is y_col > threshold.
        :param shift_knowledge: None, or the number of periods by which to shift the new columns within each group.
        :param label: 'since' or 'ongoing'; used in the new column names and to select the returned columns.
        :param counter: The function applied to each group's binary indicator (_count_since or _count_ongoing).

        Returns:
        --------
        :return: A DataFrame containing y_col followed by every other column whose name contains `label`.
        """

        df = input_df.copy()

        self._index_check(df)

        for th in thresholds:
            binary_col = y_col + '_above' + str(th)
            count_col = y_col + '_' + label + '_' + str(th)

            # NA > th evaluates to False, so missing values are treated as "not above the threshold"
            df[binary_col] = (df[y_col] > th).astype(int)
            df[count_col] = df.groupby(self.groupby_cols)[binary_col].transform(counter)

            if shift_knowledge is not None:
                #in case we need to shift by one since we don't know the y_col in current period
                df[binary_col] = df.groupby(self.groupby_cols)[binary_col].shift(shift_knowledge)
                df[count_col] = df.groupby(self.groupby_cols)[count_col].shift(shift_knowledge)

        # Selection is by substring, so matching columns already present in the input are returned as well.
        # y_col is excluded from the match so it can never appear twice in the output.
        selected = [c for c in df.columns if label in c and c != y_col]
        return df[[y_col, *selected]]

    def since(self, input_df:pd.DataFrame, y_col:str, thresholds:list, shift_knowledge:Union[int, None] = None):

        """
        This is a method that counts the number of periods since a variable has been above a given threshold.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create the count since variable.
        :param thresholds: A list of thresholds to count since.
        :param shift_knowledge: An integer defining by how many periods to shift the count since variable (None = no shift).

        Returns:
        --------
        :return: A DataFrame containing y_col and the '<y_col>_since_<threshold>' columns (plus any other input
                 column whose name contains 'since'). The remaining input columns are not returned.
        """

        return self._threshold_feature(input_df, y_col, thresholds, shift_knowledge, 'since', self._count_since)

    def ongoing(self, input_df:pd.DataFrame, y_col:str, thresholds:list, shift_knowledge:Union[int, None] = None):

        """
        This is a method that represents a sequential count of the number of periods for which a variable has been above a given threshold.

        Args:
        -----
        :param input_df: The DataFrame containing the data.
        :param y_col: The name of the column for which to create the ongoing variable.
        :param thresholds: A list of thresholds to count consecutive periods above.
        :param shift_knowledge: An integer defining by how many periods to shift the ongoing variable (None = no shift).

        Returns:
        --------
        :return: A DataFrame containing y_col and the '<y_col>_ongoing_<threshold>' columns (plus any other input
                 column whose name contains 'ongoing'). The remaining input columns are not returned.
        """

        return self._threshold_feature(input_df, y_col, thresholds, shift_knowledge, 'ongoing', self._count_ongoing)


Bonus:
- Instead of directly using my class, it would be good for your learning to try implementing at least some of these operations for yourself. It's the best way to learn!
- Advanced Python users can check out the feature-engine (https://feature-engine.trainindata.com/en/latest/) package. It's a more sophisticated way to develop custom classes for feature engineering.
- Advanced Python users can also check out dataclasses (https://www.datacamp.com/tutorial/python-data-classes). This is a more modern way of building classes in Python. I have been meaning to transition to this, but never found the time...

# (1) Feature engineering for UCDP data

The objective of this section is to introduce classic time-series feature engineering techniques for panel data.

## Load data and initialize feature engineer

In [None]:
#NB - upload the ucdp.csv file into files (toggle on the left-hand side). You will need to do this for any given run of the notebook.
#alternatively upload to your G Drive and mount your drive in this notebook
import os
#list the contents of the current working directory so you can confirm the uploaded file is visible
os.listdir()

In [None]:
#read the data (the first column of the csv is used as the row index)
ucdp = pd.read_csv("ucdp.csv", index_col = 0)

#note: sorting values is CRITICAL when using group by operations
#FeatureEngineer checks that the index is monotonically increasing before computing any feature
ucdp = ucdp.set_index(['isocode', 'period']).sort_index() #set index to our id_vars and sort values

#drop population for now - we don't need it
ucdp = ucdp.drop(columns = ['population'])

#rename the fatalities column to the shorter name used throughout the rest of the notebook
ucdp = ucdp.rename(columns = {'fatalities_UCDP': 'violence'})

#display
ucdp

In [None]:
#every feature is computed within each isocode group
fe = FeatureEngineer(groupby_cols = 'isocode')

#the class is flexible - but for this notebook we will only ever be working with this column for operations
y_col = 'violence'

#build an example condition which we will use throughout
example_iso = 'BFA'
#all integers from 201601 to 201812; presumably periods are YYYYMM integers, so values that are not real months simply never match
example_periods = np.arange(201601, 201813)
#boolean mask over the rows of ucdp: example isocode AND period within the example range
example_cond = (ucdp.index.get_level_values('isocode') == example_iso) & (ucdp.index.get_level_values('period').isin(example_periods))

#display example
ucdp[example_cond]

## Continuous features

### Basic lag

In [None]:
#very simple 1, 3 and 6 period lags
lags = [1, 3, 6]

#returns a copy of ucdp with one 'violence_basic_lag<lag>' column per lag
lag_df = fe.lag(ucdp, y_col, lags)

lag_df[example_cond]

Things you should notice:

- Checking NAs in the dataframe. When do they appear? Why? Hint:

`lag_df[lag_df['violence_basic_lag1'].isna()].reset_index()['period'].unique()`

### Rolling sum

In [None]:
#window sizes (in rows per isocode) for which a rolling sum is computed
windows = [1, 3, 6]
#None leaves the window edges at the pandas default
closed = None
#False keeps the raw rolling sum; True would store log1p of it instead
return_logs = False

rolling_sum_df = fe.rolling_sum(ucdp, y_col, windows, closed, return_logs)

rolling_sum_df[example_cond]

Things you should notice:
- I am always setting min_periods = 1 in the class. This means we only require 1 observation to compute any rolling value. If it's not obvious, check the pandas documentation to understand better https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.rolling.html.

Try:
  - Changing `closed = "left"`. How does this affect the rolling computation? In what situations would you want this property?
  - Other rolling operations e.g. mean, min, max.

Bonus:
- In what situations would you transform the feature to log terms? Why? Would it make theoretically any difference for tree-based methods?

### Weighted rolling sum

In [None]:
#here I am showing you the contribution to the sum for each timestep using a window = 3 and alpha = 0.8
#the returned weights sum to 1 and are ordered so that the last (largest) weight applies to the most recent observation
alpha = 0.8

fe._create_exponential_weights(window_size = 3, alpha = alpha)

Try:
  - Changing `alpha` and `window_size`. How do the weights change?

In [None]:
#same windows as the unweighted rolling sum, so the two outputs can be compared directly
windows = [1, 3, 6]
closed = None
return_logs = False
#decay factor used to build the exponential weights inside each window
alpha = 0.2

weighted_rolling_sum_df = fe.weighted_rolling_sum(ucdp, y_col, windows, closed, return_logs, alpha)

weighted_rolling_sum_df[example_cond]

Try:
  - Changing `alpha`. How does this affect the rolling computation? Why would you set alpha higher or lower?

## Discrete features

### Since

In [None]:
#one 'since' counter is built per threshold, using the condition violence > threshold
thresholds = [0, 10, 100]
#None means the counters are not shifted
shift_knowledge = None
#the result only contains violence and the 'since' columns
since_df = fe.since(ucdp, y_col, thresholds, shift_knowledge)

since_df[example_cond]

### Ongoing

In [None]:
#one 'ongoing' counter is built per threshold, using the condition violence > threshold
thresholds = [0, 10, 100]
#None means the counters are not shifted
shift_knowledge = None
#the result only contains violence and the 'ongoing' columns
ongoing_df = fe.ongoing(ucdp, y_col, thresholds, shift_knowledge)

ongoing_df[example_cond]

Try:
- Setting `shift_knowledge = 1` for since/ongoing. How does the feature value change? Does it remind you of a parameter we have for rolling features? When would you use it?





# (2) Feature engineering for LDA topics

In [None]:
#both files are sorted by isocode and period as soon as they are read
raw_topics_lby = pd.read_csv('raw_topics_lby.csv', index_col = 0).sort_values(by = ['isocode', 'period']) #raw topics, only includes LBY - full history (198901 to 202412)
topics = pd.read_csv('topics.csv', index_col = 0).sort_values(by = ['isocode', 'period']) #stock topics, all countries (201001 to 202412)

#subset to example isocode and 201001 to 202412
example_iso = 'LBY'
topics_lby = topics[(topics['isocode'] == example_iso)].sort_values(by = ['isocode', 'period'])
#keep only the raw-topic periods that also appear in the stock-topic data for the example isocode
raw_topics_lby = raw_topics_lby[raw_topics_lby['period'].isin(topics_lby['period'].unique())].sort_values(by = ['isocode', 'period'])

#display
display(topics_lby)
display(raw_topics_lby)

In [None]:
#plot raw topic share vs stock topic share for a single topic

fig, ax = plt.subplots(figsize=(10, 6), dpi=300)

#index of the topic to plot; selects the pr_topic_<idx> and stock_topic_<idx> columns
topic_idx = 2

#180 monthly labels formatted as YYYYMM, starting at 201001 (i.e. 201001 to 202412)
#NOTE(review): assumes both dataframes have exactly 180 rows in period order - confirm if the data changes
timesteps = pd.Index(pd.date_range(start="2010-01-01", periods=180, freq="MS").strftime("%Y%m"))
raw_topics_lby['plot_period'] = timesteps
topics_lby['plot_period'] = timesteps

sns.lineplot(data=raw_topics_lby, x='plot_period', y=f'pr_topic_{topic_idx}', label='Raw topic', ax=ax, alpha = 0.6)
sns.lineplot(data=topics_lby, x='plot_period', y=f'stock_topic_{topic_idx}', label='Stock topic', ax=ax)

#set xticks every 12
ax.set_xticks(np.arange(0, 180, 12), labels=timesteps[np.arange(0, 180, 12)], rotation=90)

ax.set_xlabel("")
ax.set_ylabel("Topic share")

Bonus:
- Can you code up your own functions to compute the stock topic share? Use the formulas provided in class as reference. Start with raw_topics_lby.csv (Libya only), apply your function/s and the results should be the same as the stock_topic columns (for Libya) in topics.csv.
- If you can do this for one topic column, one country time series then it would be trivial for you to manage this for all columns, all countries!

# (3) Generate predictions

Below is a quick intro to the panelsplit cross_val_fit_predict method


```
from panelsplit.application import cross_val_fit_predict

preds, fitted_estimators = cross_val_fit_predict(
    estimator=, #your ML model of choice e.g. RandomForestClassifier()
    X=, #your feature dataframe
    y= ,#your target e.g. target_df[target_col]
    cv=, #your cross-validation strategy. All you need is your initialized panelsplit object e.g. PanelSplit(periods, n_splits, test_size, gap)
    method=, #'predict' or 'predict_proba'
    drop_na_in_y= #whether to drop NAs if they are still present in your target
)

Returns
-------

preds: np.array
  numpy array of your predictions
fitted_estimators: list
  a list of fitted models for each split

```

# Task for next session

Using the session_2 and session_3 notebooks, your objective is to generate pseudo-out-of-sample predictions. The specification of the task is as follows:

- **Target variable:** Anyviolence for a 3 month forecasting horizon. This is a classification task. Please generate predictions for both incidence and onset.
- **Test period:** Your test predictions should start in 202301. You should generate pseudo-out-of-sample predictions for every month (`test_size=1`) up to and including 202412. Remember to set gap correctly.
- **Features**: As a minimum, your feature set should be as follows:

a) Rolling mean of fatalities for window sizes of [1, 3, 12, 36, 60]. Switch this to weighted if you prefer.

b) Number of months since violence exceeded 0 fatalities

c) Periods of consecutive months exceeding 0 fatalities (ongoing)

d) LDA stock topics (you can use topics.csv, these have the stock computation already applied)

As a default please use:



```
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(max_depth=4, max_features=0.2, min_samples_leaf=100, random_state=42)
```

You are free to experiment with additional features/models if you like! Do not invest time in hyperparameter tuning.

**Hints:**

- Check out PanelSplit.gen_test_labels() --> this is a really easy way for you to collect predictions across every split. Make sure you retain the `since` column in your final predictions dataframe - you will need this for evaluation.

```
ps = PanelSplit(periods, n_splits, test_size, gap)
final_preds_df = ps.gen_test_labels(target_df.merge(X['violence_since_0'], left_index=True, right_index=True, how='left')) #this will generate a dataframe where you can save down your predictions. Retains the since column also.
final_preds_df['preds'] = preds[:, 1] #here I am saving the predicted probabilities for class 1 (assumes method is 'predict_proba')
```

- The default method in `cross_val_fit_predict` is `predict`. Since this is a classification task, you will want `predict_proba`.
- Check out the `drop_na_in_y` parameter in the `cross_val_fit_predict` method of `panelsplit.application`. This will avoid:

```
ValueError: Input y contains NaN.
```

Make sure you understand what is going on under the hood. Check the source code!

**Bonus:**

- Run models using only historical violence features and only text features. In the next session you will learn how to evaluate performance. Then you can compare the models you already ran (i.e. all features) with the history-only and text-only models.
- Same as above, but now run for a 12 month, rather than a 3 month, horizon.


