In [13]:
from faker import Faker
import pandas as pd 
import random as rnd 
from datetime import datetime, timedelta
import shutil

In [14]:
# --- Configuration ---
# Seed every random source used in this notebook so that a fresh
# "Restart Kernel -> Run All" regenerates exactly the same tables.
Faker.seed(42)   # Faker's generator: names, dates, sentences, ...
rnd.seed(42)     # stdlib random: case noise, duplicate sampling, foreign-key picks, shuffling
fake = Faker()

In [15]:
# Full name written into the "student_fullname" column of every generated table.
student_fullname = "RATSIHOARANA Nomenahitantsoa Amy Andriamalala"

In [26]:

def _corrupt_value(val, is_email=False):
    """Return the string ``val`` with random casing and, for emails, an optional typo.

    Casing: roughly 1/3 upper-case, 1/3 lower-case, 1/3 per-character random case.
    Email typos (only when ``is_email`` is true): 10 % drop every "@", 10 % drop
    every ".", 5 % insert a space before "@", 5 % overwrite one random character
    with a digit; the remaining 70 % get no extra error.
    """
    r = rnd.random()
    if r < 0.33:
        val = val.upper()
    elif r < 0.66:
        val = val.lower()
    else:
        val = "".join(rnd.choice([c.upper(), c.lower()]) for c in val)

    if is_email:
        err = rnd.random()
        if err < 0.1:          # 10 %: remove "@"
            val = val.replace("@", "")
        elif err < 0.2:        # 10 %: remove "."
            val = val.replace(".", "")
        elif err < 0.25:       # 5 %: add a space before "@"
            val = val.replace("@", " @")
        elif err < 0.3:        # 5 %: replace one random character with a digit
            if len(val) > 3:
                pos = rnd.randint(0, len(val) - 1)
                val = val[:pos] + str(rnd.randint(0, 9)) + val[pos + 1:]
    return val


def apply_format_errors(df, col, is_email=False):
    """Inject random formatting noise into column ``col`` of ``df``.

    Every non-missing cell is converted to ``str`` and passed through
    ``_corrupt_value``. Missing cells (None / NaN / NA) are left untouched
    instead of being turned into the literal text "None" or "nan".

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. The column is overwritten **in place**.
    col : str
        Name of the column to corrupt.
    is_email : bool, default False
        When true, additionally apply the email-specific typos.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # Iterate by position rather than by index label so frames with a
    # non-unique index are handled row by row.
    df[col] = [
        cell
        if cell is None or (pd.api.types.is_scalar(cell) and pd.isna(cell))
        else _corrupt_value(str(cell), is_email)
        for cell in df[col]
    ]
    return df



In [None]:
# --- TABLE PROJECT ---
# 40 unique projects: start date between two and one year ago, end date
# somewhere between that start date and today.
projects = []
for project_id in range(1, 41):
    started = fake.date_between(start_date='-2y', end_date='-1y')
    ended = fake.date_between(start_date=started, end_date='today')
    project_name = fake.catch_phrase()
    client_name = fake.company()
    semver = f"{rnd.randint(0,5)}.{rnd.randint(0,9)}.{rnd.randint(0,9)}"

    projects.append({
        "projectid": project_id,
        "student_fullname": student_fullname,
        "name": project_name,
        "client": client_name,
        "startdate": started,
        "enddate": ended,
        "version": semver,
    })

# 20 duplicates: copies of randomly chosen originals, differing only by a fresh ID.
for new_id, source_pos in enumerate(rnd.sample(range(40), 20), start=len(projects) + 1):
    projects.append({**projects[source_pos], "projectid": new_id})

# Build the frame, then add casing noise to the free-text columns.
df_projects = pd.DataFrame(projects)
for noisy_col in ("name", "client"):
    df_projects = apply_format_errors(df_projects, noisy_col)

print(df_projects)


In [27]:
# --- TABLE DEVELOPER ---
# 40 unique developers, each with a generated name, email and job title.
developers = [
    {
        "devid": dev_id,
        "student_fullname": student_fullname,
        "name": fake.name(),
        "email": fake.email(),
        "specialty": fake.job(),
    }
    for dev_id in range(1, 41)
]

# 20 duplicates: same data as a randomly chosen original, only the ID is new.
for new_id, source_pos in enumerate(rnd.sample(range(40), 20), start=len(developers) + 1):
    developers.append({**developers[source_pos], "devid": new_id})

# Casing noise on name and specialty; the email column also gets email-specific typos.
df_developers = pd.DataFrame(developers)
for noisy_col, email_mode in (("name", False), ("specialty", False), ("email", True)):
    df_developers = apply_format_errors(df_developers, noisy_col, is_email=email_mode)


In [19]:
# --- TABLE BUG ---
severity_options = ["Low", "Medium", "High", "Critical"]
status_options = ["Open", "In Progress", "Fixed", "Closed"]

# 70 unique bugs, each attached to the ID of a project picked from `projects`.
bugs = [
    {
        "bugid": bug_id,
        "student_fullname": student_fullname,
        "projectid": rnd.choice(projects)["projectid"],
        "title": fake.sentence(nb_words=6),
        "description": fake.paragraph(nb_sentences=3),
        "severity": rnd.choice(severity_options),
        "status": rnd.choice(status_options),
        "createdby": fake.name(),
    }
    for bug_id in range(1, 71)
]

# 30 duplicates: same data as a randomly chosen original, only the ID is new.
for new_id, source_pos in enumerate(rnd.sample(range(70), 30), start=len(bugs) + 1):
    bugs.append({**bugs[source_pos], "bugid": new_id})

# Casing noise on the title and severity columns.
df_bugs = pd.DataFrame(bugs)
for noisy_col in ("title", "severity"):
    df_bugs = apply_format_errors(df_bugs, noisy_col)


In [20]:
# --- TABLE RELEASE ---
releases = []

# Convert the project frame to records once; it does not change inside the
# loop, so rebuilding it on each of the 60 iterations was wasted work.
project_records = df_projects.to_dict(orient='records')

for releaseid in range(60):  # 60 unique releases
    proj = rnd.choice(project_records)
    version = f"{fake.random_int(0,5)}.{fake.random_int(0,9)}.{fake.random_int(0,9)}"
    # Release date: up to 365 days after the start date of the chosen project.
    release_date = proj["startdate"] + timedelta(days=rnd.randint(0, 365))
    notes = fake.sentence(nb_words=8)

    releases.append({
        "releaseid": releaseid + 1,
        "student_fullname": student_fullname,
        "projectid": proj["projectid"],
        "version": version,
        "releasedate": release_date,
        "notes": notes
    })

# 20 duplicates: same data as a randomly chosen original, only the ID is new.
original_indices = rnd.sample(range(60), 20)
for i in original_indices:
    duplicate = releases[i].copy()
    duplicate["releaseid"] = len(releases) + 1
    releases.append(duplicate)

df_releases = pd.DataFrame(releases)
df_releases = apply_format_errors(df_releases, "version")
df_releases = apply_format_errors(df_releases, "notes")

In [21]:
# --- TABLE BUGFIX ---
bugfixes = []

# Convert both parent frames to records once; neither changes inside the
# loop, so rebuilding them on each of the 90 iterations was wasted work.
bug_records = df_bugs.to_dict(orient='records')
developer_records = df_developers.to_dict(orient='records')

for fixid in range(90):  # 90 unique fixes
    bug = rnd.choice(bug_records)
    dev = rnd.choice(developer_records)

    bugfixes.append({
        "fixid": fixid + 1,
        "student_fullname": student_fullname,
        "bugid": bug["bugid"],
        "devid": dev["devid"],
        "fixdate": fake.date_between(start_date="-1y", end_date="today"),
        "description": fake.sentence(nb_words=8)
    })

# 30 duplicates: same data as a randomly chosen original, only the ID is new.
original_indices = rnd.sample(range(90), 30)
for i in original_indices:
    duplicate = bugfixes[i].copy()
    duplicate["fixid"] = len(bugfixes) + 1
    bugfixes.append(duplicate)

df_bugfix = pd.DataFrame(bugfixes)
df_bugfix = apply_format_errors(df_bugfix, "description")

In [24]:
# --- TABLE PROJECTDEVELOPER ---
from itertools import product
import random

# Every possible (projectid, devid) pairing, shuffled so that the first 60
# form a random selection of distinct pairings.
pair_pool = list(product(df_projects["projectid"], df_developers["devid"]))
random.shuffle(pair_pool)

projectdevelopers = [
    {
        "projectid": projid,
        "devid": devid,
        "student_fullname": student_fullname,
    }
    for projid, devid in pair_pool[:60]
]

# Casing noise on the only text column of this link table.
df_projectdeveloper = apply_format_errors(
    pd.DataFrame(projectdevelopers), "student_fullname"
)




In [23]:
import os

# Destination folder for the generated CSV files; created if it does not exist.
output_dir = "../export_csv"
os.makedirs(output_dir, exist_ok=True)

# File name -> table, listed in the order the files are written.
csv_exports = {
    "projects.csv": df_projects,
    "developers.csv": df_developers,
    "bugs.csv": df_bugs,
    "releases.csv": df_releases,
    "bugfixes.csv": df_bugfix,
    "project_developers.csv": df_projectdeveloper,
}
for csv_name, table in csv_exports.items():
    table.to_csv(os.path.join(output_dir, csv_name), index=False)


In [28]:
# Write the developer table to developers.csv in output_dir once more
# (same target file as the export cell).
developers_csv_path = os.path.join(output_dir, "developers.csv")
df_developers.to_csv(developers_csv_path, index=False)