In [18]:
import random as rnd
import shutil
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
from faker import Faker

In [19]:
# --- Configuration ---
# One seed drives both random sources used by this notebook, so that
# Restart Kernel -> Run All regenerates the same tables.
SEED = 42
Faker.seed(SEED)  # seeds Faker's random generator
rnd.seed(SEED)    # seeds the stdlib `random` module (imported as `rnd`)
fake = Faker()

In [20]:
# Value written to the "student_fullname" column of every generated table.
student_fullname = "RATSIHOARANA Nomenahitantsoa Amy Andriamalala"

In [21]:
# --- Formatting-error injection ---
def apply_format_errors(df, col):
    """Randomly corrupt the text formatting of one column, in place.

    One ``rnd.random()`` draw is made per row, in index order:

    * draw < 0.3 (about 30 %): the value is upper-cased and padded with
      two spaces on each side;
    * 0.3 <= draw < 0.5 (about 20 %): the value is lower-cased;
    * otherwise (about 50 %): the value is left unchanged.

    Missing values (``None`` / ``NaN``) are never rewritten, so they are
    not turned into literal strings such as ``"nan"`` or ``"  NONE  "``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Cells are read and written by index label with
        ``df.at``, so the index labels are expected to be unique.
    col : str
        Label of the column to corrupt.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (mutated in place).
    """
    for label in df.index:
        # Drawn for every row, including skipped ones, so the random
        # sequence consumed does not depend on where missing values sit.
        draw = rnd.random()
        current = df.at[label, col]
        if pd.api.types.is_scalar(current) and pd.isna(current):
            continue  # keep missing values missing
        text = str(current)
        if draw < 0.3:       # 30 %: upper case + surrounding spaces
            df.at[label, col] = "  " + text.upper() + "  "
        elif draw < 0.5:     # 20 %: lower case
            df.at[label, col] = text.lower()
        # remaining 50 %: unchanged
    return df

In [None]:
# --- PROJECT TABLE ---
# `projects` must exist before the loop below appends to it.
projects = []
for pid in range(1, 41):  # 40 unique projects, ids 1..40
    # Start date between two years and one year ago; end date anywhere
    # between that start date and today, so enddate >= startdate.
    start_date = fake.date_between(start_date='-2y', end_date='-1y')
    end_date = fake.date_between(start_date=start_date, end_date='today')

    projects.append({
        "projectid": pid,
        "student_fullname": student_fullname,
        "name": fake.catch_phrase(),                 # Faker catch phrase used as the project name
        "client": fake.company(),
        "startdate": start_date,
        "enddate": end_date,
        "version": fake.semver(),                    # version string from Faker's semver provider
    })

# Add 20 duplicates: same data as an existing project, except for the id.
indices = rnd.sample(range(40), 20)
for i in indices:
    dup = projects[i].copy()
    dup["projectid"] = len(projects) + 1  # next free id (41..60)
    projects.append(dup)



df_projects = pd.DataFrame(projects)
df_projects = apply_format_errors(df_projects, "name")
df_projects = apply_format_errors(df_projects, "client")

print(df_projects)


    projectid                               student_fullname            name  \
0           1  RATSIHOARANA Nomenahitantsoa Amy Andriamalala      issueforge   
1           2  RATSIHOARANA Nomenahitantsoa Amy Andriamalala      SquashFlow   
2           3  RATSIHOARANA Nomenahitantsoa Amy Andriamalala      BUGPULSE     
3           4  RATSIHOARANA Nomenahitantsoa Amy Andriamalala     TRACKNEST     
4           5  RATSIHOARANA Nomenahitantsoa Amy Andriamalala       errorgrid   
5           6  RATSIHOARANA Nomenahitantsoa Amy Andriamalala        FixRadar   
6           7  RATSIHOARANA Nomenahitantsoa Amy Andriamalala       DebugDock   
7           8  RATSIHOARANA Nomenahitantsoa Amy Andriamalala     TRACEHIVE     
8           9  RATSIHOARANA Nomenahitantsoa Amy Andriamalala        PatchBay   
9          10  RATSIHOARANA Nomenahitantsoa Amy Andriamalala        bugpulse   
10         11  RATSIHOARANA Nomenahitantsoa Amy Andriamalala        FixRadar   
11         12  RATSIHOARANA Nomenahitant

In [23]:
# --- DEVELOPER TABLE ---
specialties = ["Frontend", "Backend", "Fullstack", "DevOps", "QA", "Security"]
developers = []

for dev_number in range(1, 41):  # 40 developers, ids 1..40
    full_name = fake.name()
    # E-mail local part: lower-cased name with dots and apostrophes
    # removed, its words joined by ".".
    local_part = ".".join(
        full_name.lower().replace(".", "").replace("'", "").split()
    )
    chosen_specialty = rnd.choice(specialties)
    developers.append(dict(
        devid=dev_number,
        student_fullname=student_fullname,
        name=full_name,
        email=f"{local_part}@gmail.com",
        specialty=chosen_specialty,
    ))

# Append exact copies (same devid) of 20 of the 40 developers above.
developers.extend(developers[pos].copy() for pos in rnd.sample(range(40), 20))

df_developers = pd.DataFrame(developers)
for noisy_column in ("name", "specialty"):
    df_developers = apply_format_errors(df_developers, noisy_column)

In [24]:
# --- BUG TABLE ---
bug_titles = [
    "Login button not responding on mobile",
    "Database connection timeout after 30 seconds",
    "User profile image not uploading",
    "Password reset email not sent",
    "Dashboard loads slowly with large datasets",
]
bug_descriptions = [
    "Steps to reproduce: 1. Navigate to login page 2. Click login button 3. Nothing happens",
    "The application loses connection to the database after exactly 30 seconds of inactivity",
    "When users try to upload a profile picture, the upload progress bar reaches 100% but image never appears",
]
severity_options = ["Low", "Medium", "High", "Critical"]
status_options = ["Open", "In Progress", "Fixed", "Closed"]

bugs = []
for bug_number in range(1, 71):  # 70 bugs, ids 1..70
    # Position of a random row of df_projects; the stored projectid is
    # that position + 1.
    project_pos = rnd.randint(0, len(df_projects) - 1)
    picked_title = rnd.choice(bug_titles)
    bug_record = {
        "bugid": bug_number,
        "student_fullname": student_fullname,
        "projectid": project_pos + 1,
        "title": picked_title,
        "description": rnd.choice(bug_descriptions),
        "severity": rnd.choice(severity_options),
        "status": rnd.choice(status_options),
        "createdby": fake.name(),
    }
    bugs.append(bug_record)

# Append exact copies (same bugid) of 30 of the 70 bugs above.
bugs.extend(bugs[pos].copy() for pos in rnd.sample(range(70), 30))

df_bugs = pd.DataFrame(bugs)
for noisy_column in ("title", "severity"):
    df_bugs = apply_format_errors(df_bugs, noisy_column)


In [25]:
# --- RELEASES TABLE ---
release_notes = [
    "Fixed critical security vulnerability",
    "Added new dashboard with real-time analytics",
    "Improved performance by 40%",
]
releases = []
for releaseid in range(60):
    # Position of a random row of df_projects; the stored projectid is
    # that position + 1.
    project_index = rnd.randint(0, len(df_projects)-1)
    version = f"{rnd.randint(0,5)}.{rnd.randint(0,9)}.{rnd.randint(0,9)}"
    # Release date = start date of the chosen project + 0..365 days.
    release_date = df_projects.iloc[project_index]["startdate"] + timedelta(days=rnd.randint(0,365))
    releases.append({
        # 1-based, like the ids of every other table (was 0-based).
        "releaseid": releaseid + 1,
        "student_fullname": student_fullname,
        "projectid": project_index + 1,
        "version": version,
        "releasedate": release_date,
        "notes": rnd.choice(release_notes)
    })

# Append exact copies (same releaseid) of 20 of the 60 releases above.
original_indices = rnd.sample(range(60), 20)
for i in original_indices:
    duplicate = releases[i].copy()
    releases.append(duplicate)

df_releases = pd.DataFrame(releases)
df_releases = apply_format_errors(df_releases, "version")
df_releases = apply_format_errors(df_releases, "notes")

In [26]:
# --- BUGFIX TABLE ---
fix_descriptions = [
    "Updated authentication logic",
    "Optimized database query",
    "Fixed file upload handler",
]
# Every fix carries the date on which the notebook is run; computed once
# instead of once per row.
fix_date = pd.Timestamp.now().date()

bugfixes = []
for fixid in range(90):
    bug_index = rnd.randint(0, len(df_bugs)-1)
    dev_index = rnd.randint(0, len(df_developers)-1)
    bugfixes.append({
        "fixid": fixid + 1,
        "student_fullname": student_fullname,
        # Read the id stored in the sampled row. Duplicated rows of
        # df_bugs / df_developers keep their original id, so
        # "row position + 1" can point at an id that does not exist.
        "bugid": int(df_bugs["bugid"].iloc[bug_index]),
        "devid": int(df_developers["devid"].iloc[dev_index]),
        "fixdate": fix_date,
        "description": rnd.choice(fix_descriptions)
    })

# Append exact copies (same fixid) of 30 of the 90 fixes above.
original_indices = rnd.sample(range(90), 30)
for i in original_indices:
    duplicate = bugfixes[i].copy()
    bugfixes.append(duplicate)

df_bugfix = pd.DataFrame(bugfixes)
df_bugfix = apply_format_errors(df_bugfix, "description")

In [27]:
# --- PROJECTDEVELOPER TABLE ---
# 60 random (project, developer) associations.
project_developers = []
for _ in range(60):
    proj_index = rnd.randint(0, len(df_projects)-1)
    dev_index = rnd.randint(0, len(df_developers)-1)
    pd_entry = {
        # Read the ids stored in the sampled rows. Duplicated developer
        # rows keep their original devid, so "row position + 1" can
        # point at a devid that does not exist.
        "projectid": int(df_projects["projectid"].iloc[proj_index]),
        "devid": int(df_developers["devid"].iloc[dev_index]),
        "student_fullname": student_fullname
    }
    project_developers.append(pd_entry)

df_projectdeveloper = pd.DataFrame(project_developers)
df_projectdeveloper = apply_format_errors(df_projectdeveloper, "student_fullname")


In [28]:
# --- CSV export ---
# Create the output directory first: to_csv does not create missing
# parent directories and would fail on a fresh checkout.
EXPORT_DIR = Path("export_csv")
EXPORT_DIR.mkdir(parents=True, exist_ok=True)

# File name -> frame to write; every file is written without the index.
csv_exports = {
    "projects.csv": df_projects,
    "developers.csv": df_developers,
    "bugs.csv": df_bugs,
    "releases.csv": df_releases,
    "bugfixes.csv": df_bugfix,
    "project_developers.csv": df_projectdeveloper,
}
for csv_name, frame in csv_exports.items():
    frame.to_csv(EXPORT_DIR / csv_name, index=False)