In [1]:
import os
import sys
from typing import Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sp
import seaborn as sns
from matplotlib.figure import Figure

sns.set()



# Report template: one line per statistic, every placeholder rendered with three
# significant digits (`.3g`). It is meant to be filled with
# OUTPUT_TEMPLATE.format(**values), where the keyword names are the placeholder
# names below. The equal-variance fields are named `*_levene_p`.
# NOTE(review): not referenced by any cell visible in this part of the notebook.
OUTPUT_TEMPLATE = (
    "Initial t-test p-value (invalid):\t\t{initial_ttest_p:.3g}\n"
    "Normality p-value of original data:\t\t{initial_weekday_normality_p:.3g}, {initial_weekend_normality_p:.3g}\n"
    "Equal variance p-value of original data:\t\t{initial_levene_p:.3g}\n"
    "Normality p-value of transformed data:\t\t{transformed_weekday_normality_p:.3g}, {transformed_weekend_normality_p:.3g}\n"
    "Equal variance p-value of transformed data:\t{transformed_levene_p:.3g}\n"
    "Normality p-value of weekly data:\t\t\t{weekly_weekday_normality_p:.3g}, {weekly_weekend_normality_p:.3g}\n"
    "Equal variance p-value of weekly data:\t\t{weekly_levene_p:.3g}\n"
    "Weekly t-test p-value:\t\t\t\t{weekly_ttest_p:.3g}\n"
    "Mann–Whitney U test p-value:\t\t\t{utest_p:.3g}"
)


def read_data(path: str) -> pd.DataFrame:
    """
    Load a line-delimited JSON file (one JSON record per line) into a DataFrame.

    Args:
        path (str): Location of the file, passed straight to pd.read_json.

    Returns:
        pd.DataFrame: The parsed records.
    """
    # do not modify
    return pd.read_json(path, lines=True) 


def split_data(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a frame into its weekday rows and its weekend rows.

    Args:
        df (pd.DataFrame): Data with an 'is_weekend' column.

    Returns:
        Tuple[pd.DataFrame, pd.DataFrame]: (rows where is_weekend is False,
        rows where is_weekend is True), in that order.
    """
    # do not modify
    wd = df.query("is_weekend==False")
    we = df.query("is_weekend==True")
    return wd, we


def draw_histogram(df: pd.DataFrame, title: str = None) -> Figure:
    """
    Draw a histogram of 'comment_count', coloured by 'is_weekend', on a new figure.

    Args:
        df (pd.DataFrame): Data with 'comment_count' and 'is_weekend' columns.
        title (str, optional): Axes title; nothing is set when it is None or empty.

    Returns:
        Figure: The newly created matplotlib figure (single axes, dpi=100).
    """
    # do not modify
    fig, ax = plt.subplots(1, 1, dpi=100)
    ret = sns.histplot(data=df, x='comment_count', hue='is_weekend', ax=ax)
    if title:
        ret.set(title=title)
    return fig



In [10]:
# Location of the gzipped, line-delimited JSON file with the comment counts.
# Set the REDDIT_COUNTS_PATH environment variable to run the notebook on another
# machine; the fallback below is the original author's local copy.
path = os.environ.get(
    "REDDIT_COUNTS_PATH",
    "/Users/antonioslagarias/IFT6758/hw3-en/data/reddit-counts.json.gz",
)

# load data
raw_df = read_data(path)

In [11]:
raw_df

Unnamed: 0,date,subreddit,comment_count
0,2012-02-20,newfoundland,7
1,2015-01-26,Manitoba,1
2,2013-09-07,Yukon,2
3,2014-02-15,saskatchewan,5
4,2014-07-06,canada,1652
...,...,...,...
15465,2012-05-21,Quebec,365
15466,2012-05-21,britishcolumbia,4
15467,2013-09-07,britishcolumbia,5
15468,2011-09-10,Quebec,2


In [20]:
# NOTE(review): `datetime` is not referenced anywhere in this cell.
import datetime

# A single element of the 'date' column is a pandas Timestamp, which exposes
# `.year` directly. The `.dt` accessor only exists on a whole Series
# (e.g. raw_df["date"].dt.year); calling it on one element is what produces
# "'Timestamp' object has no attribute 'dt'".
# NOTE(review): the AttributeError captured under this cell mentions `.dt`, which
# this code never touches, so that output looks stale — re-run the cell.
raw_df["date"][0].year



AttributeError: 'Timestamp' object has no attribute 'dt'

In [None]:
# Show the container type returned by read_data.
type(raw_df)

In [26]:
def process_data(
    df: pd.DataFrame,
    subreddit: str = "canada",
    years: Tuple[int, ...] = (2012, 2013),
) -> pd.DataFrame:
    """
    Filter the raw comment counts and flag weekend days.

    1. Keep only the rows whose 'subreddit' equals `subreddit` (exact,
       case-sensitive match; the data spells it 'canada').
    2. Keep only the rows whose 'date' falls in one of `years`.
    3. Add a Boolean column 'is_weekend' (True for Saturday and Sunday).

    The input frame is never modified; a new frame is returned.

    Args:
        df (pd.DataFrame): Frame with the columns 'date', 'subreddit' and
            'comment_count'. 'date' must have a datetime dtype because the
            `.dt` accessor is used on it.
        subreddit (str): Subreddit to keep. Defaults to "canada".
        years (Tuple[int, ...]): Calendar years to keep. Defaults to (2012, 2013).

    Returns:
        pd.DataFrame: The kept rows with their original index labels and columns,
        plus 'is_weekend'.
    """
    # Build one combined mask so only the rows that survive are copied.
    keep = (df["subreddit"] == subreddit) & df["date"].dt.year.isin(list(years))

    # Take an explicit copy of the selection: adding a column to a bare
    # boolean-mask slice raises SettingWithCopyWarning on pandas < 3.
    processed = df.loc[keep].copy()

    # Series.dt.weekday counts Monday as 0, so 5 and 6 are Saturday and Sunday.
    processed["is_weekend"] = processed["date"].dt.weekday.isin([5, 6])
    return processed

In [33]:
raw_df

Unnamed: 0,date,subreddit,comment_count
0,2012-02-20,newfoundland,7
1,2015-01-26,Manitoba,1
2,2013-09-07,Yukon,2
3,2014-02-15,saskatchewan,5
4,2014-07-06,canada,1652
...,...,...,...
15465,2012-05-21,Quebec,365
15466,2012-05-21,britishcolumbia,4
15467,2013-09-07,britishcolumbia,5
15468,2011-09-10,Quebec,2


In [35]:
# Build the weekday / weekend frames from the notebook's own pipeline so the cell
# works on a fresh kernel (Restart & Run All) instead of relying on a leftover
# variable from an earlier session.
processed_df = process_data(raw_df)
wd, we = split_data(processed_df)

In [43]:
# Two-sample t-test (scipy defaults) comparing the weekday and weekend comment counts.
sp.ttest_ind(wd["comment_count"], we["comment_count"]).pvalue

np.float64(1.3005502847208094e-58)

In [42]:
# Normality test on the weekday comment counts; a small p-value rejects normality.
sp.normaltest(wd["comment_count"])

NormaltestResult(statistic=np.float64(32.21804641032879), pvalue=np.float64(1.0091137251707994e-07))

In [44]:
# Fligner-Killeen test for equal variances between the weekday and weekend counts.
# NOTE(review): OUTPUT_TEMPLATE names the equal-variance statistic `*_levene_p`,
# so sp.levene may be the intended test here — confirm.
sp.fligner(wd["comment_count"], we["comment_count"]).pvalue

np.float64(0.06604578026604031)

In [45]:
# TODO - Complétez cette méthode
def tests(wd: pd.DataFrame, we: pd.DataFrame, verbose: bool = False) -> Tuple[float, float, float, float]:
    """
    Performs a t-test between the two inputs, checking whether the mean of the two distributions is
    the same. It also checks whether the two input data sets have a normal distribution and the
    same variance (a requirement for the t-test).

    Reference: https://docs.scipy.org/doc/scipy/reference/stats.html#statistical-tests

    Arguments:
    wd (pd.DataFrame): weekday data
    we (pd.DataFrame): weekend data
    verbose (bool): Whether to display the results

    Returns:
    Tuple[float, float, float, float]: p_test, p_wd_isnormal, p_we_isnormal, p_vartest
    """
    p_ttest, p_wd_normal, p_we_normal, p_vartest = None, None, None, None

    p_ttest=sp.ttest_ind(wd["comment_count"], we["comment_count"]).pvalue
    p_wd_normal=sp.normaltest(wd["comment_count"]).pvalue
    p_we_normal=sp.normaltest(we["comment_count"]).pvalue
    p_vartest=sp.fligner(wd["comment_count"], we["comment_count"]).pvalue
    
    # TODO: Get the p-value for the t-test

    # TODO: Get the p-value for the normality test on the weekday and weekend data separately
    # That is, are both distributions normal?

    # TODO: Get the p-value for the test that checks whether these two distributions have the same variance
    # ---------- DO NOT CHANGE THE FUNCTION BELOW THIS LINE ---------- #

    if verbose:
        print(f"p_value:\t{p_ttest.round(5)}")
        print(f"WD normality:\t{p_wd_normal.round(5)}")
        print(f"WE normality:\t{p_we_normal.round(5)}")
        print(f"Variance test:\t{p_vartest.round(5)}")

    return p_ttest, p_wd_normal, p_we_normal, p_vartest

In [46]:
# Print the four p-values and display the returned tuple.
tests(wd, we, verbose=True)

p_value:	0.0
WD normality:	0.0
WE normality:	0.00152
Variance test:	0.06605


(np.float64(1.3005502847208094e-58),
 np.float64(1.0091137251707994e-07),
 np.float64(0.0015209196859635404),
 np.float64(0.06604578026604031))

In [None]:
def central_limit_theorem(df: pd.DataFrame) -> pd.DataFrame:
    """
    Average the (untransformed) comment counts per ISO (year, week) pair,
    separately for weekdays and weekends.

    Every row is labelled with the ISO year and ISO week number of its date.
    Rows are then grouped by (ISO year, ISO week, is_weekend) and each group is
    replaced by the mean of its 'comment_count'.

    Note: around New Year's Day the ISO year differs from the calendar year
    (2012-12-31 belongs to ISO week 1 of 2013). The ISO year is the one used for
    grouping, and no additional filtering is performed.

    Arguments:
    df(pd.DataFrame): Cleaned dataframe containing (at least) the columns:
    'date' (datetime dtype), 'comment_count', 'is_weekend'. It is not modified.

    Returns:
    pd.DataFrame: One row per (ISO year, ISO week, is_weekend) group, with the
    columns 'iso_year', 'iso_week', 'is_weekend' and 'comment_count' (group mean).
    """
    # Vectorised equivalent of calling date.isocalendar() on every row.
    iso = df["date"].dt.isocalendar()

    # assign() returns a new frame, so the caller's data is left untouched.
    # as_index=False keeps 'is_weekend' as a regular column, which split_data needs.
    clt: pd.DataFrame = (
        df.assign(iso_year=iso["year"], iso_week=iso["week"])
        .groupby(["iso_year", "iso_week", "is_weekend"], as_index=False)["comment_count"]
        .mean()
    )
    return clt