# Removing duplicates
For now, the data from the two databases from the two Technical Debt Dataset versions has only been combined. However, there are now duplicated SonarQube analysis, that need to be cleaned.

In [12]:
import pandas as pd
import os

In [13]:
# data import
current_dir = os.getcwd()

# construct path to the project data folder
data_dir = os.path.join(current_dir, '..', '..', 'Data','Sonar_Measures')

# load SonarQube measure data
df = pd.read_csv(os.path.join(data_dir, 'sonar_measures_v1_v2.csv'), low_memory = False)
df

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,,,,,,,,,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144635,,org.apache:zookeeper,2014-07-25 16:23:40,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
144636,,org.apache:zookeeper,2014-07-24 23:31:57,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
144637,,org.apache:zookeeper,2014-07-24 23:16:09,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144638,,org.apache:zookeeper,2014-07-24 22:58:38,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,


## Approach
To identify duplicated analysis, we need the PROJECT_ID and the SQ_ANALYSIS_DATE. If both of columns are identical between different rows in the dataset, it is sure that they're duplicates. This is because for one project, there can't be two different analysis at the same time.

## Cleaning the PROJECT_IDs
The two databases have a different naming scheme for their variables. For version 1, the PROJECT_ID is the name of the project. For version 2, the PROJECT_ID contains the name with an 'org.apache:' added in front of the name. Since having only the name is nicer for analysis, the 'org.apache:' prefix is removed from the PROJECT_ID.

In [14]:
print(set(df["PROJECT_ID"].to_list()))

{'org.apache:net', 'org.apache:ognl', 'zookeeper', 'accumulo', 'commons-net', 'org.apache:hive', 'cocoon', 'commons-bcel', 'org.apache:collections', 'org.apache:jxpath', 'commons-jxpath', 'commons-collections', 'commons-codec', 'commons-fileupload', 'commons-validator', 'commons-exec', 'httpcomponents-core', 'commons-cli', 'org.apache:codec', 'org.apache:configuration', 'commons-jelly', 'org.apache:santuario', 'org.apache:validator', 'org.apache:vfs', 'felix', 'org.apache:thrift', 'org.apache:daemon', 'org.apache:zookeeper', 'org.apache:batik', 'org.apache:digester', 'org.apache:bcel', 'httpcomponents-client', 'ambari', 'org.apache:httpclient', 'org.apache:cocoon', 'beam', 'org.apache:dbutils', 'org.apache:commons-fileupload', 'org.apache:cayenne', 'org.apache:commons-exec', 'mina-sshd', 'org.apache:httpcore', 'atlas', 'santuario', 'commons-io', 'org.apache:commons-cli', 'commons-digester', 'commons-daemon', 'org.apache:commons-io', 'org.apache:dbcp', 'commons-jexl', 'batik', 'org.apac

In [15]:
df['PROJECT_ID'] = df['PROJECT_ID'].str.replace('org.apache:', '', regex=False)
df

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,,,,,,,,,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144635,,zookeeper,2014-07-25 16:23:40,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
144636,,zookeeper,2014-07-24 23:31:57,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
144637,,zookeeper,2014-07-24 23:16:09,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144638,,zookeeper,2014-07-24 22:58:38,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,


In [16]:
df = df.drop_duplicates(subset=['PROJECT_ID', 'SQ_ANALYSIS_DATE'])
df

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,,,,,,,,,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144635,,zookeeper,2014-07-25 16:23:40,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
144636,,zookeeper,2014-07-24 23:31:57,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
144637,,zookeeper,2014-07-24 23:16:09,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144638,,zookeeper,2014-07-24 22:58:38,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,


After dropping duplicates, there are ~400 rows less. One of the projects which showed to be in part duplicated in both database versions is zookeeper. Next it is checked if zookeeper looks clean.

In [17]:
df[df['PROJECT_ID'] == 'zookeeper'].sort_values(by='SQ_ANALYSIS_DATE').head(-50)

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
144639,,zookeeper,2014-07-23 18:17:17,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
77518,16e14c8d6b3c10d4697fab51c74cb4a7640f1b4f,zookeeper,2014-07-23T18:17:17Z,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144638,,zookeeper,2014-07-24 22:58:38,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144637,,zookeeper,2014-07-24 23:16:09,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144636,,zookeeper,2014-07-24 23:31:57,733.0,424.0,4233.0,9304.0,13.2,9031.0,21.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77874,d857e042fe4d188a7ba750ea6947f93f39cb7ed1,zookeeper,2017-12-06T22:14:10Z,834.0,523.0,5015.0,11304.0,13.5,10545.0,20.3,...,,,,,,,,,,
77875,665038e0698cd471a25b7ead4c8a50f767a3e7ed,zookeeper,2017-12-06T22:54:44Z,834.0,523.0,5015.0,11304.0,13.5,10545.0,20.3,...,,,,,,,,,,
77876,dcfbe45241855f2caccf3848be2e66e7aa23aa96,zookeeper,2017-12-12T18:39:45Z,834.0,523.0,5015.0,11304.0,13.5,10545.0,20.3,...,,,,,,,,,,
77877,9e30b9bf8fac56db3846c8cc42997cdc23a9358d,zookeeper,2017-12-12T18:44:32Z,834.0,523.0,5015.0,11304.0,13.5,10545.0,20.3,...,,,,,,,,,,


Zookeeper has different formatting for SQ_ANALYSIS_DATE between the two database versions. Therefore, the duplicates haven't been properly found out.

In [18]:
def standardize_date(date_str):
    """This function standardizes values in SQ_ANALYSIS_DATE from 2014-08-14T12:33:18Z format to 2014-08-14 12:33:18."""
    if 'T' in date_str:
        date_str = date_str.replace('T', ' ').replace('Z', '')
    return date_str

df['SQ_ANALYSIS_DATE'] = df['SQ_ANALYSIS_DATE'].apply(standardize_date)
df[df['PROJECT_ID'] == 'zookeeper'].sort_values(by='SQ_ANALYSIS_DATE').head(-50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['SQ_ANALYSIS_DATE'] = df['SQ_ANALYSIS_DATE'].apply(standardize_date)


Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
77518,16e14c8d6b3c10d4697fab51c74cb4a7640f1b4f,zookeeper,2014-07-23 18:17:17,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144639,,zookeeper,2014-07-23 18:17:17,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144638,,zookeeper,2014-07-24 22:58:38,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
77519,4fcc59ff15bfd7e456ae342581eff330923edac5,zookeeper,2014-07-24 22:58:38,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
144637,,zookeeper,2014-07-24 23:16:09,732.0,423.0,4230.0,9302.0,13.2,9029.0,21.5,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
77874,d857e042fe4d188a7ba750ea6947f93f39cb7ed1,zookeeper,2017-12-06 22:14:10,834.0,523.0,5015.0,11304.0,13.5,10545.0,20.3,...,,,,,,,,,,
77875,665038e0698cd471a25b7ead4c8a50f767a3e7ed,zookeeper,2017-12-06 22:54:44,834.0,523.0,5015.0,11304.0,13.5,10545.0,20.3,...,,,,,,,,,,
77876,dcfbe45241855f2caccf3848be2e66e7aa23aa96,zookeeper,2017-12-12 18:39:45,834.0,523.0,5015.0,11304.0,13.5,10545.0,20.3,...,,,,,,,,,,
77877,9e30b9bf8fac56db3846c8cc42997cdc23a9358d,zookeeper,2017-12-12 18:44:32,834.0,523.0,5015.0,11304.0,13.5,10545.0,20.3,...,,,,,,,,,,


Now, all dates in zookeeper have the same format. Now the dopping of duplicates is repeated to get rid of potential duplicate analysis.

In [19]:
df = df.drop_duplicates(subset=['PROJECT_ID', 'SQ_ANALYSIS_DATE'])
df

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
0,e0880e263e4bf8662ba3848405200473a25dfc9f,accumulo,2008-07-07 14:52:05,2108.0,1103.0,17295.0,13509.0,6.2,43137.0,40.6,...,,,,,,,,,,
1,e8774c5ec3a35e042f320540b5f7e66ebd2d9e87,accumulo,2008-07-07 12:31:47,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
2,2032ebbd0ed90734da39ca238bbd10dee24d0030,accumulo,2008-07-05 18:54:27,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
3,de297d4932e08625a5df146f0802041bb5aeb892,accumulo,2008-07-03 20:21:40,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
4,34efaae87639a83b60fdb7274de4b45051025a3a,accumulo,2008-07-02 00:12:36,2108.0,1103.0,17295.0,13507.0,6.2,43137.0,40.6,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144413,,vfs,2002-08-20 06:10:50,69.0,65.0,425.0,1536.0,24.2,690.0,10.6,...,,,,,,,,,,
144414,,vfs,2002-08-20 02:57:02,69.0,65.0,422.0,1533.0,24.1,693.0,10.7,...,,,,,,,,,,
144415,,vfs,2002-07-19 11:54:15,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,,,,,,,,,,
144416,,vfs,2002-07-18 16:47:24,69.0,65.0,421.0,1513.0,24.1,687.0,10.6,...,,,,,,,,,,


Another ~4000 rows have been dropped.

In [20]:
df["PROJECT_ID"].unique().tolist()

['accumulo',
 'ambari',
 'atlas',
 'aurora',
 'batik',
 'beam',
 'cocoon',
 'commons-bcel',
 'commons-beanutils',
 'commons-cli',
 'commons-codec',
 'commons-collections',
 'commons-configuration',
 'commons-daemon',
 'commons-dbcp',
 'commons-dbutils',
 'commons-digester',
 'commons-exec',
 'commons-fileupload',
 'commons-io',
 'commons-jelly',
 'commons-jexl',
 'commons-jxpath',
 'commons-net',
 'commons-ognl',
 'commons-validator',
 'commons-vfs',
 'felix',
 'httpcomponents-client',
 'httpcomponents-core',
 'mina-sshd',
 'santuario',
 'zookeeper',
 'cayenne',
 'archiva',
 'bcel',
 'beanutils',
 'codec',
 'collections',
 'configuration',
 'daemon',
 'dbcp',
 'dbutils',
 'digester',
 'hive',
 'httpclient',
 'httpcore',
 'jxpath',
 'net',
 'ognl',
 'thrift',
 'validator',
 'vfs']

In [21]:
df[df['PROJECT_ID'] == 'jxpath'].sort_values(by='SQ_ANALYSIS_DATE').head(-50)

Unnamed: 0,COMMIT_HASH,PROJECT_ID,SQ_ANALYSIS_DATE,CLASSES,FILES,FUNCTIONS,COMMENT_LINES,COMMENT_LINES_DENSITY,COMPLEXITY,FILE_COMPLEXITY,...,sg_i.JAVA_CYCLIC_PACKAGES_PERCENT,sg_i.MAX_MODULE_NCCD,sg_i.ARCHITECTURE_FEATURE_AVAILABLE,sg_i.NUMBER_OF_ISSUES,sg_i.NUMBER_OF_CRITICAL_ISSUES_WITHOUT_RESOLUTION,sg_i.VIOLATING_COMPONENTS_PERCENT,sg_i.UNASSIGNED_COMPONENTS_PERCENT,sg_i.NUMBER_OF_THRESHOLD_VIOLATIONS,sg_i.NUMBER_OF_WORKSPACE_WARNINGS,sg_i.NUMBER_OF_IGNORED_CRITICAL_ISSUES
133639,,jxpath,2001-08-23 03:38:00,104.0,101.0,1140.0,2012.0,13.2,4541.0,45.0,...,,,,,,,,,,
133638,,jxpath,2001-09-03 01:22:31,115.0,112.0,1226.0,2108.0,13.1,4725.0,42.2,...,,,,,,,,,,
133637,,jxpath,2001-09-08 20:59:58,115.0,112.0,1230.0,2114.0,13.1,4733.0,42.3,...,,,,,,,,,,
133636,,jxpath,2001-09-08 21:01:00,125.0,122.0,1258.0,2203.0,13.4,4773.0,39.1,...,,,,,,,,,,
133635,,jxpath,2001-09-09 00:52:04,125.0,122.0,1260.0,2227.0,13.5,4782.0,39.2,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133098,,jxpath,2012-01-21 18:19:43,242.0,225.0,2205.0,5112.0,17.3,6019.0,26.8,...,,,,,,,,,,
133097,,jxpath,2012-02-24 20:54:12,242.0,225.0,2205.0,5112.0,17.3,6019.0,26.8,...,,,,,,,,,,
133096,,jxpath,2012-03-17 05:09:23,242.0,225.0,2205.0,5112.0,17.3,6019.0,26.8,...,,,,,,,,,,
133095,,jxpath,2013-01-17 22:02:44,242.0,225.0,2205.0,5112.0,17.3,6019.0,26.8,...,,,,,,,,,,


## Result
The formatting of the date column is now consistent over the two database versions and duplicates have been removed from the data. More than 140000 unique analysis remain.

In [22]:
df.to_csv(os.path.join(data_dir, 'sonar_measures_v1_v2_no_duplicates.csv'), index = False)