In [1]:
# Install meteostat with the interpreter that runs this kernel (sys.executable),
# rather than whichever `pip` happens to be first on PATH.
# NOTE(review): the version is not pinned, so a re-run may pick up a different release.
import sys
!{sys.executable} -m pip install meteostat



In [2]:
from datetime import datetime

from meteostat import Point, Daily
import pandas as pd

In [3]:
# Raw commit log read by the parsing cell: header lines of the form
# <[hash;author;date]> followed by tab-separated "additions, deletions, path" lines.
INPUT_FILE = "data/original/commits.txt"
# Output: per-commit, per-extension line totals for 2024 commits (";"-separated CSV).
COMMITS_CSV_FILE = "data/generated/commits.csv"
# Output: line totals per file across all parsed commits.
FILES_CSV_FILE = "data/generated/files.csv"
# Output: daily weather data fetched through meteostat.
BAR_WEATHER_FILE = "data/generated/bar_weather.csv"

In [4]:
class FileInfo:
    """One changed-file entry of a commit: path, derived name/extension and line counts.

    Built from a single line of the form ``additions<TAB>deletions<TAB>file_path``.
    """

    def __init__(self, line):
        """Parse one ``additions<TAB>deletions<TAB>file_path`` line.

        Args:
            line: The raw text line. No stripping is performed here.

        Raises:
            ValueError: If the line has fewer than three tab-separated fields.
        """
        # maxsplit=2 keeps any further tab inside the path instead of breaking the unpack.
        additions, deletions, self.file_path = line.split("\t", 2)
        self.file_name = self.file_path.split("/")[-1]
        file_name_parts = self.file_name.split(".")
        # A name without any dot has no extension.
        self.extension = file_name_parts[-1] if len(file_name_parts) > 1 else None
        self.additions = self._parse_count(additions)
        self.deletions = self._parse_count(deletions)

    @staticmethod
    def _parse_count(raw):
        """Return ``raw`` converted to int, or None when it is not an integer literal.

        NOTE(review): presumably covers placeholder counts such as "-" for binary
        files in the log -- confirm against the command that produced it.
        """
        try:
            return int(raw)
        except ValueError:
            return None

    def to_tuple(self):
        """Return the fields as a tuple, in the same order as :meth:`columns`."""
        return (self.file_path, self.file_name, self.extension, self.additions, self.deletions)

    @staticmethod
    def columns():
        """Return the column names matching the order of :meth:`to_tuple`.

        Declared static so it works both as ``FileInfo.columns()`` and on an instance.
        """
        return ["file_path", "file_name", "extension", "additions", "deletions"]

In [5]:
class CommitInfo:
    """A single commit: its header fields plus the per-file change entries."""

    def __init__(self, commit_raw_info, files_info):
        """Parse a commit header line and the file lines that belong to it.

        Args:
            commit_raw_info: Header line of the form ``<[hash;author;date]>``.
            files_info: Iterable of raw file lines; lines that are empty after
                stripping are skipped.

        Raises:
            ValueError: If the header has fewer than three ``;``-separated fields,
                or a file line cannot be parsed by ``FileInfo``.
        """
        header = commit_raw_info.strip().replace("<[", "").replace("]>", "")
        # Take the hash from the front and the date from the back, so an author
        # name that itself contains ";" no longer breaks the unpacking.
        self.hash, remainder = header.split(";", 1)
        self.author, self.date = remainder.rsplit(";", 1)

        self.file_infos = []
        for raw_line in files_info:
            stripped = raw_line.strip()
            if not stripped:
                continue
            file_info = FileInfo(stripped)
            # Entries whose path contains a brace are excluded
            # (presumably the "{old => new}" rename notation -- confirm).
            if "{" in file_info.file_path or "}" in file_info.file_path:
                continue
            self.file_infos.append(file_info)

    def to_df(self):
        """Aggregate this commit's line counts per file extension.

        Returns:
            A DataFrame with columns hash, author, date, extension, additions and
            deletions: one row per extension, sorted by additions and then
            deletions, both descending.
        """
        columns = ["hash", "author", "date", "extension", "additions", "deletions"]
        records = [
            (self.hash, self.author, self.date, fi.extension, fi.additions, fi.deletions)
            for fi in self.file_infos
        ]
        df = pd.DataFrame.from_records(records, columns=columns)
        # dropna=False: files without an extension carry extension=None, and the
        # default groupby silently drops every row whose key is missing.
        return (
            df.groupby(["hash", "author", "date", "extension"], dropna=False)
            .sum()
            .reset_index()
            .sort_values(by=["additions", "deletions"], ascending=[False, False])
        )

In [6]:
# Split the log into commits: a line that starts with "<[" and ends with "]>" opens
# a new commit; every other line is collected as belonging to the current one.
commit_infos = []
# NOTE(review): no explicit encoding is passed, so the platform default applies --
# confirm it matches the encoding the log was written in.
with open(INPUT_FILE, "r") as f:
    info = []
    for line in f:
        # Compare without the line terminator so that a header sitting on the
        # final line of a file with no trailing newline is still recognised.
        if line.startswith("<[") and line.rstrip("\r\n").endswith("]>"):
            if info:
                commit_infos.append(CommitInfo(info[0], info[1:]))
            info = [line]
        else:
            info.append(line)
    # Flush the last commit, which has no following header to trigger it.
    if info:
        commit_infos.append(CommitInfo(info[0], info[1:]))

In [7]:
# Export per-commit, per-extension totals for commits dated 2024. The year is
# taken as the second-to-last space-separated token of the commit's date string.
frames_2024 = [
    ci.to_df()
    for ci in commit_infos
    if ci.date.split(" ")[-2] == "2024"
]
commits_2024_df = pd.concat(frames_2024)
commits_2024_df.to_csv(COMMITS_CSV_FILE, index=False, sep=";")

In [8]:
# Flatten every file entry of every commit into one table, total the line counts
# per file, and export the files that have at least one added or deleted line.
data = [file_info.to_tuple() for commit_info in commit_infos for file_info in commit_info.file_infos]
df = pd.DataFrame.from_records(data, columns=FileInfo.columns())
# dropna=False keeps files without an extension (extension is None); the default
# groupby silently drops every row whose grouping key is missing.
agg_df = (
    df.groupby(["file_path", "file_name", "extension"], dropna=False)
    .sum()
    .reset_index()
    .sort_values(by=["additions", "deletions"], ascending=[False, False])
)
# "=>" is matched as a literal substring (presumably marks rename entries -- confirm).
has_arrow = agg_df["file_path"].str.contains("=>", regex=False)
has_changes = (agg_df["additions"] > 0) | (agg_df["deletions"] > 0)
filtered_df = agg_df[~has_arrow & has_changes].reset_index(drop=True)
filtered_df.to_csv(FILES_CSV_FILE, index=False)

In [9]:
# Fetch daily weather for the point (42.099998, 19.1) from 1 Jan to 31 Dec 2024
# and store it as CSV. reset_index() turns the fetched frame's index into a
# regular column so that it is written out along with the other columns.
weather_query = Daily(
    loc=Point(42.099998, 19.1),
    start=datetime(2024, 1, 1),
    end=datetime(2024, 12, 31),
)
bar_weather_df = weather_query.fetch().reset_index()
bar_weather_df.to_csv(BAR_WEATHER_FILE, index=False)