# Explore dataset

## imports


In [20]:
import pandas as pd
import numpy as np
from statistics import mean
import json
from collections import Counter
import re


## get most common words


In [11]:
df = pd.read_csv('/content/drive/MyDrive/oreilly_january/huge_df.csv')

In [12]:
df['word_count'] = df['snippet'].apply(lambda x: len(x.split(' ')))

In [13]:
df['charecter_count'] = df['snippet'].apply(lambda x: len(x))

In [15]:
public = df[df['Status'] == 'public']
private = df[df['Status'] == 'private']


def extract_common_phrases(df, column_name='snippet', min_words=5, top_n=5):
    """
    Extracts the most common phrases with at least `min_words` words from the specified column of a DataFrame.

    Args:
    - df (pd.DataFrame): The input DataFrame.
    - column_name (str): The name of the column containing text data.
    - min_words (int): The minimum number of words a phrase must have to be considered.
    - top_n (int): The number of top phrases to return.

    Returns:
    - A DataFrame containing the most common phrases and their counts.
    """
    sub_df = df[df['Permutation_Number'] == 1]
    # Combine all text from the specified column
    all_text = ' '.join(sub_df[column_name].astype(str).tolist())


    # Tokenize into sentences and remove non-word characters
    sentences = re.split(r'[.!?]', all_text)
    sentences = [re.sub(r'[^a-zA-Z\s]', '', sentence).lower() for sentence in sentences]

    # Extract all possible phrases with at least min_words
    phrases = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) >= min_words:
            for i in range(len(words) - min_words + 1):
                phrase = ' '.join(words[i:i + min_words])
                phrases.append(phrase)

    # Count the most common phrases
    phrase_counts = Counter(phrases)

    # Get the top_n most common phrases
    most_common_phrases = phrase_counts.most_common(top_n)

    # Convert to DataFrame for display
    result_df = pd.DataFrame(most_common_phrases, columns=['Phrase', 'Count'])

    return result_df


extract_common_phrases(public, column_name='snippet', min_words=3, top_n=5)

Unnamed: 0,Phrase,Count
0,in this chapter,138
1,as well as,115
2,one of the,99
3,be able to,89
4,a lot of,85


In [16]:
extract_common_phrases(private, column_name='snippet', min_words=3, top_n=5)

Unnamed: 0,Phrase,Count
0,as well as,455
1,one of the,449
2,be able to,444
3,the number of,436
4,you want to,399


## get sample sizes of dataset

In [None]:
class Model:
  def __init__(self,model_name ,file_name, cutoff_date, color='blue'):
    self.name = model_name
    self.df =pd.read_csv(file_name)
    self.df['Date_Published'] = pd.to_datetime(self.df['date_published'])
    self.cutoff_date = pd.to_datetime([cutoff_date])[0]
    self.color = color

gpt4o = Model('GPT 4o', '/content/drive/MyDrive/oreilly_january/datasets/gpt-4o-2024-08-06.csv','2023-10-01','teal')# 10/1/2023
gpt4o_mini = Model('GPT 4o Mini', '/content/drive/MyDrive/oreilly_january/datasets/gpt-4o-mini.csv','2023-10-01','darkblue')# 10/1/2023
gpt_turbo = Model('GPT 3.5 Turbo', '/content/drive/MyDrive/oreilly_january/datasets/gpt-3.5-turbo-1106.csv','2021-09-01','purple')# 9/1/2021


In [None]:


for model in [gpt4o, gpt4o_mini, gpt_turbo]:
  df = model.df
  df['word_count'] = df['snippet'].apply(lambda x: len(x.split(' ')))

  # Convert 'date_published' to datetime
  df['date_published'] = pd.to_datetime(df['date_published'], errors='coerce')
  df = df[df['date_published'].dt.year != model.cutoff_date.year]

  # remove books used for
  df = df[~df['Title'].isin(['Prompt Engineering for Generative AI','Web API Cookbook'])]


  # Define the training period end date (for "in-dataset" vs "out of dataset")
  training_period_end_date = pd.Timestamp('2023-08-01')

  # Filter the data into 'in-dataset' and 'out of dataset'
  in_dataset = df[df['date_published'] < training_period_end_date]
  out_of_dataset = df[df['date_published'] > training_period_end_date]

  # Calculate descriptive statistics for each subset
  def calculate_descriptive_stats(df, label):
      num_paragraphs = len(df)
      avg_charecter_length = df['snippet'].apply(lambda x: len(x)).mean()
      avg_date_published = df['date_published'].mean()
      return {
          'Label': label,
          'Sample Size (n)': num_paragraphs // 24,
          # 'Average Charecter Length': round(avg_charecter_length),
          'Average Word Count': round(df['word_count'].mean()),
          # 'Average Date Published': avg_date_published.strftime('%m/%d/%Y')
      }

  # Descriptive statistics for the entire dataset
  # total_stats = calculate_descriptive_stats(df, 'Total')

  # Descriptive statistics for public and private
  public_stats = calculate_descriptive_stats(df[df['Status'] == 'public'], 'Public')
  private_stats = calculate_descriptive_stats(df[df['Status'] == 'private'], 'Private')

  # Descriptive statistics for 'in-dataset' and 'out of dataset'
  in_dataset_stats = calculate_descriptive_stats(in_dataset, 'In-Dataset')
  out_of_dataset_stats = calculate_descriptive_stats(out_of_dataset, 'Out-of-Dataset')

  # Combine results into a DataFrame for display
  descriptive_stats_df = pd.DataFrame([
      # total_stats,
      public_stats,
      private_stats,
      in_dataset_stats,
      out_of_dataset_stats
  ])

  # Display the descriptive statistics
  # import ace_tools as tools; tools.display_dataframe_to_user(name="Descriptive Statistics", dataframe=descriptive_stats_df)
  descriptive_stats_df
  print(model.name)
  print(descriptive_stats_df.to_latex(index=False, escape=False))

GPT 4o
\begin{tabular}{lrr}
\toprule
Label & Sample Size (n) & Average Word Count \\
\midrule
Public & 1965 & 112 \\
Private & 8997 & 113 \\
In-Dataset & 8985 & 113 \\
Out-of-Dataset & 1977 & 110 \\
\bottomrule
\end{tabular}

GPT 4o Mini
\begin{tabular}{lrr}
\toprule
Label & Sample Size (n) & Average Word Count \\
\midrule
Public & 1968 & 112 \\
Private & 9005 & 113 \\
In-Dataset & 8991 & 113 \\
Out-of-Dataset & 1982 & 110 \\
\bottomrule
\end{tabular}

GPT 3.5 Turbo
\begin{tabular}{lrr}
\toprule
Label & Sample Size (n) & Average Word Count \\
\midrule
Public & 1929 & 113 \\
Private & 6171 & 113 \\
In-Dataset & 5992 & 114 \\
Out-of-Dataset & 2108 & 111 \\
\bottomrule
\end{tabular}

