Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upload files in chunks (#6510) #6531

Draft
wants to merge 2 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 6 additions & 1 deletion extensions/wikibase/pom.xml
Expand Up @@ -164,7 +164,12 @@
<scope>test</scope>
</dependency>


<!-- commons-io: provides BoundedInputStream, used by MediaFileUtils to read
     a file one bounded chunk at a time for chunked uploads -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>${commons-io.version}</version>
</dependency>
</dependencies>

</project>

@@ -1,9 +1,15 @@

package org.openrefine.wikibase.editing;

import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.URL;
import java.nio.file.Files;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
Expand All @@ -15,6 +21,7 @@
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.commons.io.input.BoundedInputStream;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.wikidata.wdtk.datamodel.helpers.Datamodel;
import org.wikidata.wdtk.datamodel.interfaces.MediaInfoIdValue;
Expand Down Expand Up @@ -89,6 +96,12 @@ public void purgePage(long pageid) throws IOException, MediaWikiApiErrorExceptio
*/
public MediaUploadResponse uploadLocalFile(File path, String fileName, String wikitext, String summary, List<String> tags)
throws IOException, MediaWikiApiErrorException {
if (path.length() > 100000000) {
try (ChunkedFile chunkedFile = new ChunkedFile(path)) {
return uploadLocalFileChunked(chunkedFile, fileName, wikitext, summary, tags);
}
}

Map<String, String> parameters = new HashMap<>();
parameters.put("action", "upload");
parameters.put("tags", String.join("|", tags));
Expand All @@ -102,6 +115,63 @@ public MediaUploadResponse uploadLocalFile(File path, String fileName, String wi
return uploadFile(parameters, files);
}

/**
 * Upload a local file to the MediaWiki instance in chunks.
 *
 * Each chunk is stashed ({@code stash=1}); the server responds with a filekey and the
 * next offset, which are sent along with every subsequent chunk. A final request
 * without a chunk payload publishes the stashed file under its target name.
 *
 * @param path
 *            ChunkedFile of the local file
 * @param fileName
 *            its filename once stored on the wiki
 * @param wikitext
 *            the accompanying wikitext for the file
 * @param summary
 *            the edit summary associated with the upload
 * @param tags
 *            tags to apply to the edit
 * @return the response to the final (publishing) upload request
 * @throws IOException
 *             if reading the file or contacting the wiki fails, or if the file is empty
 * @throws MediaWikiApiErrorException
 *             if the MediaWiki API returns an error for any request
 */
protected MediaUploadResponse uploadLocalFileChunked(ChunkedFile path, String fileName, String wikitext, String summary,
        List<String> tags)
        throws IOException, MediaWikiApiErrorException {
    MediaUploadResponse response = null;
    int chunkIndex = 1;
    for (File chunk = path.readChunk(); chunk != null; chunk = path.readChunk()) {
        Map<String, String> parameters = new HashMap<>();
        parameters.put("action", "upload");
        parameters.put("token", getCsrfToken());
        parameters.put("stash", "1");
        parameters.put("filename", fileName);
        parameters.put("filesize", String.valueOf(path.getLength()));
        if (response == null) {
            // In the first request we don't have an offset or file key yet.
            parameters.put("offset", "0");
        } else {
            parameters.put("offset", String.valueOf(response.offset));
            parameters.put("filekey", response.filekey);
        }
        Map<String, ImmutablePair<String, java.io.File>> files = new HashMap<>();
        String chunkName = "chunk-" + chunkIndex + path.getExtension();
        files.put("chunk", new ImmutablePair<String, File>(chunkName, chunk));
        response = uploadFile(parameters, files);
        // The chunk is a temporary file; remove it as soon as it has been sent.
        Files.deleteIfExists(chunk.toPath());
        response.checkForErrors();
        chunkIndex++;
    }

    if (response == null) {
        // Zero chunks were produced (empty file); without a filekey there is
        // nothing to publish, and dereferencing response below would NPE.
        throw new IOException("Cannot upload empty file '" + fileName + "' in chunks");
    }

    // Publish the stashed file: no chunk payload, just the filekey plus metadata.
    Map<String, String> parameters = new HashMap<>();
    parameters.put("action", "upload");
    parameters.put("token", getCsrfToken());
    parameters.put("filename", fileName);
    parameters.put("filekey", response.filekey);
    parameters.put("tags", String.join("|", tags));
    parameters.put("comment", summary);
    parameters.put("text", wikitext);

    return uploadFile(parameters, null);
}

/**
* Upload a file that the MediaWiki server fetches directly from the supplied URL. The URL domain must likely be
* whitelisted before.
Expand Down Expand Up @@ -261,6 +331,10 @@ public static class MediaUploadResponse {
public String filename;
@JsonProperty("pageid")
public long pageid;
// Byte offset reached so far, as reported after stashing a chunk;
// sent back as the "offset" parameter of the next chunk request.
@JsonProperty("offset")
public long offset;
// Key identifying the stashed upload; sent with every subsequent
// chunk request and with the final publishing request.
@JsonProperty("filekey")
public String filekey;
@JsonProperty("warnings")
public Map<String, JsonNode> warnings;

Expand All @@ -273,12 +347,17 @@ public static class MediaUploadResponse {
* @throws MediaWikiApiErrorException
*/
public void checkForErrors() throws MediaWikiApiErrorException {
if ("Continue".equals(result)) {
return;
}

if (!"Success".equals(result)) {
throw new MediaWikiApiErrorException(result,
"The file upload action returned the '" + result + "' error code. Warnings are: " + Objects.toString(warnings));
}
if (filename == null) {
throw new MediaWikiApiErrorException(result, "The MediaWiki API did not return any filename for the uploaded file");
if (filename == null && filekey == null) {
throw new MediaWikiApiErrorException(result,
"The MediaWiki API did not return any filename or filekey for the uploaded file");
}
}

Expand Down Expand Up @@ -306,4 +385,78 @@ public MediaInfoIdValue getMid(ApiConnection connection, String siteIri) throws
return mid;
}
}

/**
* A file read one chunk at a time.
*/

public static class ChunkedFile implements Closeable {

protected FileInputStream stream;
protected final int chunkSize = 5000;
protected File path;
protected long bytesRead;
protected int chunksRead;

public ChunkedFile(File path) throws FileNotFoundException {
this.path = path;
stream = new FileInputStream(path);
bytesRead = 0;
chunksRead = 0;
}

/**
* Read the next chunk of the file.
*
* @return {File} Contains a chunk of the original file. The length in bytes is chunkSize or however much
* remains of the file if the last chunk is read.
* @throws IOException
*/
public File readChunk() throws IOException {
if (bytesRead >= path.length()) {
return null;
}

String fileName = "chunk-" + chunksRead + "-";
BoundedInputStream inStream = BoundedInputStream.builder()
.setInputStream(stream)
.setMaxCount(chunkSize)
.get();
File chunk = Files.createTempFile(fileName, getExtension()).toFile();
OutputStream outStream = new FileOutputStream(chunk);
bytesRead += inStream.transferTo(outStream);
chunksRead++;

return chunk;
}

/**
* Get length of the file.
*
* @see File#length() length
* @return {long}
*/
public long getLength() {
return path.length();
}

/**
* Get the extension from the filename.
*
* @return {String} The file extensions, including the dot. If the file has no extensions, the empty string.
*/
public String getExtension() {
int lastDotIndex = path.getName().lastIndexOf(".");
if (lastDotIndex == -1) {
return "";
}

return path.getName().substring(lastDotIndex);
}

@Override
public void close() throws IOException {
stream.close();
}
}
}
Expand Up @@ -3,6 +3,7 @@

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.eq;
import static org.mockito.Mockito.inOrder;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.times;
import static org.mockito.Mockito.verify;
Expand All @@ -13,6 +14,8 @@
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
Expand All @@ -21,6 +24,7 @@

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.commons.lang3.tuple.ImmutablePair;
import org.mockito.InOrder;
import org.mockito.Mockito;
import org.testng.annotations.Test;
Expand All @@ -33,6 +37,7 @@

import com.google.refine.util.ParsingUtilities;

import org.openrefine.wikibase.editing.MediaFileUtils.ChunkedFile;
import org.openrefine.wikibase.editing.MediaFileUtils.MediaUploadResponse;

public class MediaFileUtilsTest {
Expand Down Expand Up @@ -332,4 +337,94 @@ protected void mockCsrfCall(ApiConnection connection) throws IOException, MediaW
JsonNode tokenJsonResponse = ParsingUtilities.mapper.readTree(csrfResponse);
when(connection.sendJsonRequest("POST", tokenParams)).thenReturn(tokenJsonResponse);
}

/**
 * End-to-end test of the chunked upload flow: three chunks are stashed (each
 * request carrying the running offset and the filekey returned by the previous
 * response), then a final request without a chunk publishes the stashed file.
 */
@Test
public void testUploadLocalFileChunked() throws IOException, MediaWikiApiErrorException {
ApiConnection connection = mock(ApiConnection.class);
// mock CSRF token request
mockCsrfCall(connection);

// Mock a 10001-byte file split into 5000 + 5000 + 1 bytes, so three chunks
// are produced before readChunk() signals exhaustion with null.
ChunkedFile chunkedFile = mock(ChunkedFile.class);
when(chunkedFile.getLength()).thenReturn(10001L);
when(chunkedFile.getExtension()).thenReturn(".png");
Path firstChunk = Files.createTempFile("chunk-1-", ".png");
Path secondChunk = Files.createTempFile("chunk-2-", ".png");
Path thirdChunk = Files.createTempFile("chunk-3-", ".png");
when(chunkedFile.readChunk())
.thenReturn(firstChunk.toFile())
.thenReturn(secondChunk.toFile())
.thenReturn(thirdChunk.toFile())
.thenReturn(null);

// Initialise the upload and upload the first chunk.
Map<String, String> firstParams = new HashMap<>();
firstParams.put("action", "upload");
firstParams.put("filename", "My_test_file.png");
firstParams.put("stash", "1");
firstParams.put("filesize", "10001");
firstParams.put("offset", "0");
firstParams.put("token", csrfToken);
String firstResponseString = "{\"upload\":{\"offset\":5000,\"result\":\"Continue\",\"filekey\":\"filekey.1234.png\"}}";
JsonNode firstResponse = ParsingUtilities.mapper.readTree(firstResponseString);
Map<String, ImmutablePair<String, java.io.File>> firstFiles = new HashMap<>();
firstFiles.put("chunk", new ImmutablePair<String, File>("chunk-1.png", firstChunk.toFile()));
when(connection.sendJsonRequest(eq("POST"), eq(firstParams), eq(firstFiles))).thenReturn(firstResponse);

// Upload the second chunk (offset and filekey come from the first response).
Map<String, String> secondParams = new HashMap<>();
secondParams.put("action", "upload");
secondParams.put("filename", "My_test_file.png");
secondParams.put("stash", "1");
secondParams.put("filesize", "10001");
secondParams.put("offset", "5000");
secondParams.put("filekey", "filekey.1234.png");
secondParams.put("token", csrfToken);
String secondResponseString = "{\"upload\":{\"offset\":10000,\"result\":\"Continue\",\"filekey\":\"filekey.1234.png\"}}";
JsonNode secondResponse = ParsingUtilities.mapper.readTree(secondResponseString);
Map<String, ImmutablePair<String, java.io.File>> secondFiles = new HashMap<>();
secondFiles.put("chunk", new ImmutablePair<String, File>("chunk-2.png", secondChunk.toFile()));
when(connection.sendJsonRequest(eq("POST"), eq(secondParams), eq(secondFiles))).thenReturn(secondResponse);

// Upload the third and final chunk.
Map<String, String> thirdParams = new HashMap<>();
thirdParams.put("action", "upload");
thirdParams.put("filename", "My_test_file.png");
thirdParams.put("stash", "1");
thirdParams.put("filesize", "10001");
thirdParams.put("offset", "10000");
thirdParams.put("filekey", "filekey.1234.png");
thirdParams.put("token", csrfToken);
String thirdResponseString = "{\"upload\":{\"offset\":10001,\"result\":\"Continue\",\"filekey\":\"filekey.1234.png\"}}";
JsonNode thirdResponse = ParsingUtilities.mapper.readTree(
thirdResponseString);
Map<String, ImmutablePair<String, java.io.File>> thirdFiles = new HashMap<>();
thirdFiles.put("chunk", new ImmutablePair<String, File>("chunk-3.png", thirdChunk.toFile()));
when(connection.sendJsonRequest(eq("POST"), eq(thirdParams), eq(thirdFiles))).thenReturn(thirdResponse);

// Finalise the upload: no chunk payload, just filekey + edit metadata.
Map<String, String> finalParams = new HashMap<>();
finalParams.put("action", "upload");
finalParams.put("filename", "My_test_file.png");
finalParams.put("filekey", "filekey.1234.png");
finalParams.put("tags", "");
finalParams.put("comment", "my summary");
finalParams.put("text", "my wikitext");
finalParams.put("token", csrfToken);
JsonNode finalResponse = ParsingUtilities.mapper.readTree(successfulUploadResponse);
when(connection.sendJsonRequest(eq("POST"), eq(finalParams), eq(null))).thenReturn(finalResponse);

MediaFileUtils mediaFileUtils = new MediaFileUtils(connection);
MediaUploadResponse response = mediaFileUtils.uploadLocalFileChunked(chunkedFile, "My_test_file.png", "my wikitext", "my summary",
Collections.emptyList());

// The four requests must be issued in order: three stashed chunks, then publish.
InOrder inOrder = inOrder(connection);
inOrder.verify(connection).sendJsonRequest(eq("POST"), eq(firstParams), eq(firstFiles));
inOrder.verify(connection).sendJsonRequest(eq("POST"), eq(secondParams), eq(secondFiles));
inOrder.verify(connection).sendJsonRequest(eq("POST"), eq(thirdParams), eq(thirdFiles));
inOrder.verify(connection).sendJsonRequest(eq("POST"), eq(finalParams), eq(null));
assertEquals(response.filename, "My_test_file.png");
assertEquals(response.pageid, 12345L);
assertEquals(response.getMid(connection, Datamodel.SITE_WIKIMEDIA_COMMONS),
Datamodel.makeWikimediaCommonsMediaInfoIdValue("M12345"));
}
}